All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-16  3:59 ` Ashutosh Dixit
  0 siblings, 0 replies; 25+ messages in thread
From: Ashutosh Dixit @ 2023-03-16  3:59 UTC (permalink / raw)
  To: intel-gfx
  Cc: dri-devel, Badal Nilawar, Rodrigo Vivi, Vinay Belgaumkar, John Harrison

On dGfx, an enabled PL1 power limit that is set to a low value results in a
low GPU operating frequency. It also negates the frequency raise operation
that is done before GuC firmware load. As a result, GuC firmware load can
time out. Such timeouts were seen in GitLab issue #8062 linked below (where
the PL1 power limit was enabled and set to a low value). Therefore, while
loading GuC firmware, disable the PL1 power limit when allowed by HW.

v2:
 - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
 - Add hwm_power_max_restore to error return code path

v3 (Jani N):
 - Add/remove explanatory comments
 - Function renames
 - Type corrections
 - Locking annotation

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
 drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
 3 files changed, 55 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 4ccb4be4c9cba..aa8e35a5636a0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -18,6 +18,7 @@
 #include "intel_uc.h"
 
 #include "i915_drv.h"
+#include "i915_hwmon.h"
 
 static const struct intel_uc_ops uc_ops_off;
 static const struct intel_uc_ops uc_ops_on;
@@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
 	struct intel_guc *guc = &uc->guc;
 	struct intel_huc *huc = &uc->huc;
 	int ret, attempts;
+	bool pl1en;
 
 	GEM_BUG_ON(!intel_uc_supports_guc(uc));
 	GEM_BUG_ON(!intel_uc_wants_guc(uc));
@@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
 	else
 		attempts = 1;
 
+	/* Disable a potentially low PL1 power limit to allow freq to be raised */
+	i915_hwmon_power_max_disable(gt->i915, &pl1en);
+
 	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
 
 	while (attempts--) {
@@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
 		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
 	}
 
+	i915_hwmon_power_max_restore(gt->i915, pl1en);
+
 	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
 	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
 
@@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
 	/* Return GT back to RPn */
 	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
 
+	i915_hwmon_power_max_restore(gt->i915, pl1en);
+
 	__uc_sanitize(uc);
 
 	if (!ret) {
diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
index ee63a8fd88fc1..769b5bda4d53f 100644
--- a/drivers/gpu/drm/i915/i915_hwmon.c
+++ b/drivers/gpu/drm/i915/i915_hwmon.c
@@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
 	}
 }
 
+void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old)
+	__acquires(i915->hwmon->hwmon_lock)
+{
+	struct i915_hwmon *hwmon = i915->hwmon;
+	intel_wakeref_t wakeref;
+	u32 r;
+
+	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
+		return;
+
+	/* Take mutex to prevent concurrent hwm_power_max_write */
+	mutex_lock(&hwmon->hwmon_lock);
+
+	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
+		r = intel_uncore_rmw(hwmon->ddat.uncore,
+				     hwmon->rg.pkg_rapl_limit,
+				     PKG_PWR_LIM_1_EN, 0);
+
+	*old = !!(r & PKG_PWR_LIM_1_EN);
+}
+
+void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
+	__releases(i915->hwmon->hwmon_lock)
+{
+	struct i915_hwmon *hwmon = i915->hwmon;
+	intel_wakeref_t wakeref;
+
+	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
+		return;
+
+	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
+		intel_uncore_rmw(hwmon->ddat.uncore,
+				 hwmon->rg.pkg_rapl_limit,
+				 PKG_PWR_LIM_1_EN,
+				 old ? PKG_PWR_LIM_1_EN : 0);
+
+	mutex_unlock(&hwmon->hwmon_lock);
+}
+
 static umode_t
 hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
 {
diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
index 7ca9cf2c34c96..0fcb7de844061 100644
--- a/drivers/gpu/drm/i915/i915_hwmon.h
+++ b/drivers/gpu/drm/i915/i915_hwmon.h
@@ -7,14 +7,21 @@
 #ifndef __I915_HWMON_H__
 #define __I915_HWMON_H__
 
+#include <linux/types.h>
+
 struct drm_i915_private;
+struct intel_gt;
 
 #if IS_REACHABLE(CONFIG_HWMON)
 void i915_hwmon_register(struct drm_i915_private *i915);
 void i915_hwmon_unregister(struct drm_i915_private *i915);
+void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
+void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
 #else
 static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
 static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
+static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
+static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
 #endif
 
 #endif /* __I915_HWMON_H__ */
-- 
2.38.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-16  3:59 ` Ashutosh Dixit
  0 siblings, 0 replies; 25+ messages in thread
From: Ashutosh Dixit @ 2023-03-16  3:59 UTC (permalink / raw)
  To: intel-gfx; +Cc: dri-devel, Rodrigo Vivi

On dGfx, an enabled PL1 power limit that is set to a low value results in a
low GPU operating frequency. It also negates the frequency raise operation
that is done before GuC firmware load. As a result, GuC firmware load can
time out. Such timeouts were seen in GitLab issue #8062 linked below (where
the PL1 power limit was enabled and set to a low value). Therefore, while
loading GuC firmware, disable the PL1 power limit when allowed by HW.

v2:
 - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
 - Add hwm_power_max_restore to error return code path

v3 (Jani N):
 - Add/remove explanatory comments
 - Function renames
 - Type corrections
 - Locking annotation

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
 drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
 3 files changed, 55 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 4ccb4be4c9cba..aa8e35a5636a0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -18,6 +18,7 @@
 #include "intel_uc.h"
 
 #include "i915_drv.h"
+#include "i915_hwmon.h"
 
 static const struct intel_uc_ops uc_ops_off;
 static const struct intel_uc_ops uc_ops_on;
@@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
 	struct intel_guc *guc = &uc->guc;
 	struct intel_huc *huc = &uc->huc;
 	int ret, attempts;
+	bool pl1en;
 
 	GEM_BUG_ON(!intel_uc_supports_guc(uc));
 	GEM_BUG_ON(!intel_uc_wants_guc(uc));
@@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
 	else
 		attempts = 1;
 
+	/* Disable a potentially low PL1 power limit to allow freq to be raised */
+	i915_hwmon_power_max_disable(gt->i915, &pl1en);
+
 	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
 
 	while (attempts--) {
@@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
 		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
 	}
 
+	i915_hwmon_power_max_restore(gt->i915, pl1en);
+
 	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
 	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
 
@@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
 	/* Return GT back to RPn */
 	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
 
+	i915_hwmon_power_max_restore(gt->i915, pl1en);
+
 	__uc_sanitize(uc);
 
 	if (!ret) {
diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
index ee63a8fd88fc1..769b5bda4d53f 100644
--- a/drivers/gpu/drm/i915/i915_hwmon.c
+++ b/drivers/gpu/drm/i915/i915_hwmon.c
@@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
 	}
 }
 
+void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old)
+	__acquires(i915->hwmon->hwmon_lock)
+{
+	struct i915_hwmon *hwmon = i915->hwmon;
+	intel_wakeref_t wakeref;
+	u32 r;
+
+	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
+		return;
+
+	/* Take mutex to prevent concurrent hwm_power_max_write */
+	mutex_lock(&hwmon->hwmon_lock);
+
+	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
+		r = intel_uncore_rmw(hwmon->ddat.uncore,
+				     hwmon->rg.pkg_rapl_limit,
+				     PKG_PWR_LIM_1_EN, 0);
+
+	*old = !!(r & PKG_PWR_LIM_1_EN);
+}
+
+void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
+	__releases(i915->hwmon->hwmon_lock)
+{
+	struct i915_hwmon *hwmon = i915->hwmon;
+	intel_wakeref_t wakeref;
+
+	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
+		return;
+
+	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
+		intel_uncore_rmw(hwmon->ddat.uncore,
+				 hwmon->rg.pkg_rapl_limit,
+				 PKG_PWR_LIM_1_EN,
+				 old ? PKG_PWR_LIM_1_EN : 0);
+
+	mutex_unlock(&hwmon->hwmon_lock);
+}
+
 static umode_t
 hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
 {
diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
index 7ca9cf2c34c96..0fcb7de844061 100644
--- a/drivers/gpu/drm/i915/i915_hwmon.h
+++ b/drivers/gpu/drm/i915/i915_hwmon.h
@@ -7,14 +7,21 @@
 #ifndef __I915_HWMON_H__
 #define __I915_HWMON_H__
 
+#include <linux/types.h>
+
 struct drm_i915_private;
+struct intel_gt;
 
 #if IS_REACHABLE(CONFIG_HWMON)
 void i915_hwmon_register(struct drm_i915_private *i915);
 void i915_hwmon_unregister(struct drm_i915_private *i915);
+void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
+void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
 #else
 static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
 static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
+static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
+static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
 #endif
 
 #endif /* __I915_HWMON_H__ */
-- 
2.38.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [Intel-gfx] ✗ Fi.CI.SPARSE: warning for drm/i915/guc: Disable PL1 power limit when loading GuC firmware (rev3)
  2023-03-16  3:59 ` [Intel-gfx] " Ashutosh Dixit
  (?)
@ 2023-03-16  4:13 ` Patchwork
  -1 siblings, 0 replies; 25+ messages in thread
From: Patchwork @ 2023-03-16  4:13 UTC (permalink / raw)
  To: Dixit, Ashutosh; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/guc: Disable PL1 power limit when loading GuC firmware (rev3)
URL   : https://patchwork.freedesktop.org/series/115003/
State : warning

== Summary ==

Error: dim sparse failed
Sparse version: v0.6.2
Fast mode used, each commit won't be checked separately.
+drivers/gpu/drm/i915/i915_hwmon.c:447:6: warning: context imbalance in 'i915_hwmon_power_max_disable' - wrong count at exit
+drivers/gpu/drm/i915/i915_hwmon.c:468:6: warning: context imbalance in 'i915_hwmon_power_max_restore' - wrong count at exit



^ permalink raw reply	[flat|nested] 25+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/guc: Disable PL1 power limit when loading GuC firmware (rev3)
  2023-03-16  3:59 ` [Intel-gfx] " Ashutosh Dixit
  (?)
  (?)
@ 2023-03-16  4:37 ` Patchwork
  -1 siblings, 0 replies; 25+ messages in thread
From: Patchwork @ 2023-03-16  4:37 UTC (permalink / raw)
  To: Dixit, Ashutosh; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 4170 bytes --]

== Series Details ==

Series: drm/i915/guc: Disable PL1 power limit when loading GuC firmware (rev3)
URL   : https://patchwork.freedesktop.org/series/115003/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_12867 -> Patchwork_115003v3
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/index.html

Participating hosts (38 -> 37)
------------------------------

  Missing    (1): fi-snb-2520m 

Known issues
------------

  Here are the changes found in Patchwork_115003v3 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live@execlists:
    - fi-bsw-n3050:       [PASS][1] -> [INCOMPLETE][2] ([i915#6972])
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/fi-bsw-n3050/igt@i915_selftest@live@execlists.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/fi-bsw-n3050/igt@i915_selftest@live@execlists.html

  * igt@i915_selftest@live@migrate:
    - bat-dg2-11:         [PASS][3] -> [DMESG-WARN][4] ([i915#7699])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/bat-dg2-11/igt@i915_selftest@live@migrate.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/bat-dg2-11/igt@i915_selftest@live@migrate.html

  * igt@i915_selftest@live@reset:
    - bat-rpls-1:         [PASS][5] -> [ABORT][6] ([i915#4983])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/bat-rpls-1/igt@i915_selftest@live@reset.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/bat-rpls-1/igt@i915_selftest@live@reset.html

  * igt@kms_chamelium_hpd@common-hpd-after-suspend:
    - fi-bsw-nick:        NOTRUN -> [SKIP][7] ([fdo#109271]) +1 similar issue
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/fi-bsw-nick/igt@kms_chamelium_hpd@common-hpd-after-suspend.html

  
#### Possible fixes ####

  * igt@i915_selftest@live@execlists:
    - fi-bsw-nick:        [ABORT][8] ([i915#7911] / [i915#7913]) -> [PASS][9]
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/fi-bsw-nick/igt@i915_selftest@live@execlists.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/fi-bsw-nick/igt@i915_selftest@live@execlists.html

  * igt@i915_selftest@live@gt_heartbeat:
    - fi-kbl-soraka:      [DMESG-FAIL][10] ([i915#5334] / [i915#7872]) -> [PASS][11]
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/fi-kbl-soraka/igt@i915_selftest@live@gt_heartbeat.html
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/fi-kbl-soraka/igt@i915_selftest@live@gt_heartbeat.html
    - bat-jsl-3:          [DMESG-FAIL][12] ([i915#5334]) -> [PASS][13]
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/bat-jsl-3/igt@i915_selftest@live@gt_heartbeat.html
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/bat-jsl-3/igt@i915_selftest@live@gt_heartbeat.html

  
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [i915#4983]: https://gitlab.freedesktop.org/drm/intel/issues/4983
  [i915#5334]: https://gitlab.freedesktop.org/drm/intel/issues/5334
  [i915#6972]: https://gitlab.freedesktop.org/drm/intel/issues/6972
  [i915#7699]: https://gitlab.freedesktop.org/drm/intel/issues/7699
  [i915#7872]: https://gitlab.freedesktop.org/drm/intel/issues/7872
  [i915#7911]: https://gitlab.freedesktop.org/drm/intel/issues/7911
  [i915#7913]: https://gitlab.freedesktop.org/drm/intel/issues/7913


Build changes
-------------

  * Linux: CI_DRM_12867 -> Patchwork_115003v3

  CI-20190529: 20190529
  CI_DRM_12867: 67d4276f8342780b8eaa6e9f5c15d979254a5675 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_7196: 9b8c5dbe8cd82163ee198c43b81222d2b9b75fd4 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_115003v3: 67d4276f8342780b8eaa6e9f5c15d979254a5675 @ git://anongit.freedesktop.org/gfx-ci/linux


### Linux commits

189522cdd0e1 drm/i915/guc: Disable PL1 power limit when loading GuC firmware

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/index.html

[-- Attachment #2: Type: text/html, Size: 5000 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [Intel-gfx] ✓ Fi.CI.IGT: success for drm/i915/guc: Disable PL1 power limit when loading GuC firmware (rev3)
  2023-03-16  3:59 ` [Intel-gfx] " Ashutosh Dixit
                   ` (2 preceding siblings ...)
  (?)
@ 2023-03-16  9:05 ` Patchwork
  -1 siblings, 0 replies; 25+ messages in thread
From: Patchwork @ 2023-03-16  9:05 UTC (permalink / raw)
  To: Dixit, Ashutosh; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 19426 bytes --]

== Series Details ==

Series: drm/i915/guc: Disable PL1 power limit when loading GuC firmware (rev3)
URL   : https://patchwork.freedesktop.org/series/115003/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_12867_full -> Patchwork_115003v3_full
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  

Participating hosts (8 -> 8)
------------------------------

  No changes in participating hosts

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_115003v3_full:

### IGT changes ###

#### Suppressed ####

  The following results come from untrusted machines, tests, or statuses.
  They do not affect the overall result.

  * igt@kms_cursor_legacy@single-move@all-pipes:
    - {shard-rkl}:        [PASS][1] -> [ABORT][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-2/igt@kms_cursor_legacy@single-move@all-pipes.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-3/igt@kms_cursor_legacy@single-move@all-pipes.html

  
Known issues
------------

  Here are the changes found in Patchwork_115003v3_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_exec_fair@basic-throttle@rcs0:
    - shard-glk:          [PASS][3] -> [FAIL][4] ([i915#2842])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-glk4/igt@gem_exec_fair@basic-throttle@rcs0.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-glk6/igt@gem_exec_fair@basic-throttle@rcs0.html

  * igt@gem_lmem_swapping@heavy-multi:
    - shard-glk:          NOTRUN -> [SKIP][5] ([fdo#109271] / [i915#4613])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-glk1/igt@gem_lmem_swapping@heavy-multi.html

  * igt@gem_mmap_gtt@fault-concurrent-x:
    - shard-snb:          [PASS][6] -> [ABORT][7] ([i915#5161])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-snb4/igt@gem_mmap_gtt@fault-concurrent-x.html
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-snb2/igt@gem_mmap_gtt@fault-concurrent-x.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions:
    - shard-glk:          [PASS][8] -> [FAIL][9] ([i915#2346])
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-glk1/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-glk6/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html

  * igt@kms_frontbuffer_tracking@psr-1p-primscrn-pri-indfb-draw-mmap-gtt:
    - shard-glk:          NOTRUN -> [SKIP][10] ([fdo#109271]) +36 similar issues
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-glk1/igt@kms_frontbuffer_tracking@psr-1p-primscrn-pri-indfb-draw-mmap-gtt.html

  * igt@kms_psr2_sf@cursor-plane-update-sf:
    - shard-glk:          NOTRUN -> [SKIP][11] ([fdo#109271] / [i915#658])
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-glk1/igt@kms_psr2_sf@cursor-plane-update-sf.html

  * igt@kms_vblank@pipe-c-ts-continuation-suspend:
    - shard-apl:          [PASS][12] -> [ABORT][13] ([i915#180])
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-apl7/igt@kms_vblank@pipe-c-ts-continuation-suspend.html
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-apl4/igt@kms_vblank@pipe-c-ts-continuation-suspend.html

  
#### Possible fixes ####

  * igt@gem_exec_balancer@fairslice:
    - {shard-rkl}:        [SKIP][14] ([i915#6259]) -> [PASS][15]
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-5/igt@gem_exec_balancer@fairslice.html
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-1/igt@gem_exec_balancer@fairslice.html

  * igt@gem_exec_reloc@basic-cpu-gtt-noreloc:
    - {shard-rkl}:        [SKIP][16] ([i915#3281]) -> [PASS][17] +5 similar issues
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@gem_exec_reloc@basic-cpu-gtt-noreloc.html
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-5/igt@gem_exec_reloc@basic-cpu-gtt-noreloc.html

  * igt@gem_pwrite@basic-random:
    - {shard-rkl}:        [SKIP][18] ([i915#3282]) -> [PASS][19]
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@gem_pwrite@basic-random.html
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-5/igt@gem_pwrite@basic-random.html

  * igt@gen9_exec_parse@allowed-all:
    - shard-glk:          [ABORT][20] ([i915#5566]) -> [PASS][21]
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-glk2/igt@gen9_exec_parse@allowed-all.html
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-glk1/igt@gen9_exec_parse@allowed-all.html

  * igt@i915_pm_rpm@modeset-lpsp:
    - {shard-rkl}:        [SKIP][22] ([i915#1397]) -> [PASS][23]
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@i915_pm_rpm@modeset-lpsp.html
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-6/igt@i915_pm_rpm@modeset-lpsp.html

  * igt@kms_big_fb@x-tiled-32bpp-rotate-0:
    - {shard-rkl}:        [SKIP][24] ([i915#1845] / [i915#4098]) -> [PASS][25] +17 similar issues
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@kms_big_fb@x-tiled-32bpp-rotate-0.html
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-6/igt@kms_big_fb@x-tiled-32bpp-rotate-0.html

  * igt@kms_big_fb@y-tiled-max-hw-stride-64bpp-rotate-0-hflip:
    - {shard-tglu}:       [SKIP][26] ([i915#1845]) -> [PASS][27] +12 similar issues
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-tglu-10/igt@kms_big_fb@y-tiled-max-hw-stride-64bpp-rotate-0-hflip.html
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-tglu-8/igt@kms_big_fb@y-tiled-max-hw-stride-64bpp-rotate-0-hflip.html

  * igt@kms_fence_pin_leak:
    - {shard-tglu}:       [SKIP][28] ([fdo#109274] / [i915#1845]) -> [PASS][29]
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-tglu-10/igt@kms_fence_pin_leak.html
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-tglu-8/igt@kms_fence_pin_leak.html

  * igt@kms_flip@flip-vs-expired-vblank-interruptible@a-hdmi-a1:
    - shard-glk:          [FAIL][30] ([i915#2122]) -> [PASS][31]
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-glk6/igt@kms_flip@flip-vs-expired-vblank-interruptible@a-hdmi-a1.html
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-glk8/igt@kms_flip@flip-vs-expired-vblank-interruptible@a-hdmi-a1.html

  * igt@kms_frontbuffer_tracking@fbc-1p-primscrn-cur-indfb-draw-pwrite:
    - {shard-rkl}:        [SKIP][32] ([i915#1849] / [i915#4098]) -> [PASS][33] +7 similar issues
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@kms_frontbuffer_tracking@fbc-1p-primscrn-cur-indfb-draw-pwrite.html
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-6/igt@kms_frontbuffer_tracking@fbc-1p-primscrn-cur-indfb-draw-pwrite.html

  * igt@kms_frontbuffer_tracking@fbc-1p-rte:
    - {shard-tglu}:       [SKIP][34] ([i915#1849]) -> [PASS][35] +2 similar issues
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-tglu-10/igt@kms_frontbuffer_tracking@fbc-1p-rte.html
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-tglu-8/igt@kms_frontbuffer_tracking@fbc-1p-rte.html

  * {igt@kms_plane@invalid-pixel-format-settings}:
    - {shard-rkl}:        [SKIP][36] ([i915#8152]) -> [PASS][37]
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@kms_plane@invalid-pixel-format-settings.html
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-6/igt@kms_plane@invalid-pixel-format-settings.html

  * igt@kms_plane@plane-panning-top-left@pipe-a-planes:
    - {shard-rkl}:        [SKIP][38] ([i915#1849]) -> [PASS][39] +2 similar issues
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@kms_plane@plane-panning-top-left@pipe-a-planes.html
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-6/igt@kms_plane@plane-panning-top-left@pipe-a-planes.html

  * igt@kms_psr@cursor_blt:
    - {shard-rkl}:        [SKIP][40] ([i915#1072]) -> [PASS][41]
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@kms_psr@cursor_blt.html
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-6/igt@kms_psr@cursor_blt.html

  * igt@kms_psr_stress_test@invalidate-primary-flip-overlay:
    - {shard-rkl}:        [SKIP][42] ([i915#5461]) -> [PASS][43]
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@kms_psr_stress_test@invalidate-primary-flip-overlay.html
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-6/igt@kms_psr_stress_test@invalidate-primary-flip-overlay.html

  * igt@perf@gen12-unprivileged-single-ctx-counters:
    - {shard-rkl}:        [SKIP][44] ([fdo#109289]) -> [PASS][45]
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-5/igt@perf@gen12-unprivileged-single-ctx-counters.html
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-1/igt@perf@gen12-unprivileged-single-ctx-counters.html

  * igt@prime_vgem@basic-fence-flip:
    - {shard-rkl}:        [SKIP][46] ([fdo#109295] / [i915#3708] / [i915#4098]) -> [PASS][47]
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@prime_vgem@basic-fence-flip.html
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-6/igt@prime_vgem@basic-fence-flip.html

  * igt@prime_vgem@basic-fence-read:
    - {shard-rkl}:        [SKIP][48] ([fdo#109295] / [i915#3291] / [i915#3708]) -> [PASS][49]
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12867/shard-rkl-1/igt@prime_vgem@basic-fence-read.html
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/shard-rkl-5/igt@prime_vgem@basic-fence-read.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109274]: https://bugs.freedesktop.org/show_bug.cgi?id=109274
  [fdo#109279]: https://bugs.freedesktop.org/show_bug.cgi?id=109279
  [fdo#109280]: https://bugs.freedesktop.org/show_bug.cgi?id=109280
  [fdo#109289]: https://bugs.freedesktop.org/show_bug.cgi?id=109289
  [fdo#109295]: https://bugs.freedesktop.org/show_bug.cgi?id=109295
  [fdo#109308]: https://bugs.freedesktop.org/show_bug.cgi?id=109308
  [fdo#109309]: https://bugs.freedesktop.org/show_bug.cgi?id=109309
  [fdo#109315]: https://bugs.freedesktop.org/show_bug.cgi?id=109315
  [fdo#110189]: https://bugs.freedesktop.org/show_bug.cgi?id=110189
  [fdo#110723]: https://bugs.freedesktop.org/show_bug.cgi?id=110723
  [fdo#111068]: https://bugs.freedesktop.org/show_bug.cgi?id=111068
  [fdo#111614]: https://bugs.freedesktop.org/show_bug.cgi?id=111614
  [fdo#111615]: https://bugs.freedesktop.org/show_bug.cgi?id=111615
  [fdo#111825]: https://bugs.freedesktop.org/show_bug.cgi?id=111825
  [fdo#111827]: https://bugs.freedesktop.org/show_bug.cgi?id=111827
  [fdo#112054]: https://bugs.freedesktop.org/show_bug.cgi?id=112054
  [fdo#112283]: https://bugs.freedesktop.org/show_bug.cgi?id=112283
  [i915#1072]: https://gitlab.freedesktop.org/drm/intel/issues/1072
  [i915#132]: https://gitlab.freedesktop.org/drm/intel/issues/132
  [i915#1397]: https://gitlab.freedesktop.org/drm/intel/issues/1397
  [i915#1722]: https://gitlab.freedesktop.org/drm/intel/issues/1722
  [i915#180]: https://gitlab.freedesktop.org/drm/intel/issues/180
  [i915#1825]: https://gitlab.freedesktop.org/drm/intel/issues/1825
  [i915#1845]: https://gitlab.freedesktop.org/drm/intel/issues/1845
  [i915#1849]: https://gitlab.freedesktop.org/drm/intel/issues/1849
  [i915#1902]: https://gitlab.freedesktop.org/drm/intel/issues/1902
  [i915#2122]: https://gitlab.freedesktop.org/drm/intel/issues/2122
  [i915#2346]: https://gitlab.freedesktop.org/drm/intel/issues/2346
  [i915#2527]: https://gitlab.freedesktop.org/drm/intel/issues/2527
  [i915#2532]: https://gitlab.freedesktop.org/drm/intel/issues/2532
  [i915#2575]: https://gitlab.freedesktop.org/drm/intel/issues/2575
  [i915#2582]: https://gitlab.freedesktop.org/drm/intel/issues/2582
  [i915#2587]: https://gitlab.freedesktop.org/drm/intel/issues/2587
  [i915#2672]: https://gitlab.freedesktop.org/drm/intel/issues/2672
  [i915#2705]: https://gitlab.freedesktop.org/drm/intel/issues/2705
  [i915#280]: https://gitlab.freedesktop.org/drm/intel/issues/280
  [i915#2842]: https://gitlab.freedesktop.org/drm/intel/issues/2842
  [i915#2920]: https://gitlab.freedesktop.org/drm/intel/issues/2920
  [i915#3116]: https://gitlab.freedesktop.org/drm/intel/issues/3116
  [i915#315]: https://gitlab.freedesktop.org/drm/intel/issues/315
  [i915#3281]: https://gitlab.freedesktop.org/drm/intel/issues/3281
  [i915#3282]: https://gitlab.freedesktop.org/drm/intel/issues/3282
  [i915#3291]: https://gitlab.freedesktop.org/drm/intel/issues/3291
  [i915#3297]: https://gitlab.freedesktop.org/drm/intel/issues/3297
  [i915#3299]: https://gitlab.freedesktop.org/drm/intel/issues/3299
  [i915#3323]: https://gitlab.freedesktop.org/drm/intel/issues/3323
  [i915#3359]: https://gitlab.freedesktop.org/drm/intel/issues/3359
  [i915#3458]: https://gitlab.freedesktop.org/drm/intel/issues/3458
  [i915#3539]: https://gitlab.freedesktop.org/drm/intel/issues/3539
  [i915#3546]: https://gitlab.freedesktop.org/drm/intel/issues/3546
  [i915#3547]: https://gitlab.freedesktop.org/drm/intel/issues/3547
  [i915#3555]: https://gitlab.freedesktop.org/drm/intel/issues/3555
  [i915#3558]: https://gitlab.freedesktop.org/drm/intel/issues/3558
  [i915#3637]: https://gitlab.freedesktop.org/drm/intel/issues/3637
  [i915#3638]: https://gitlab.freedesktop.org/drm/intel/issues/3638
  [i915#3689]: https://gitlab.freedesktop.org/drm/intel/issues/3689
  [i915#3708]: https://gitlab.freedesktop.org/drm/intel/issues/3708
  [i915#3734]: https://gitlab.freedesktop.org/drm/intel/issues/3734
  [i915#3742]: https://gitlab.freedesktop.org/drm/intel/issues/3742
  [i915#3840]: https://gitlab.freedesktop.org/drm/intel/issues/3840
  [i915#3886]: https://gitlab.freedesktop.org/drm/intel/issues/3886
  [i915#3952]: https://gitlab.freedesktop.org/drm/intel/issues/3952
  [i915#3955]: https://gitlab.freedesktop.org/drm/intel/issues/3955
  [i915#404]: https://gitlab.freedesktop.org/drm/intel/issues/404
  [i915#4070]: https://gitlab.freedesktop.org/drm/intel/issues/4070
  [i915#4077]: https://gitlab.freedesktop.org/drm/intel/issues/4077
  [i915#4079]: https://gitlab.freedesktop.org/drm/intel/issues/4079
  [i915#4083]: https://gitlab.freedesktop.org/drm/intel/issues/4083
  [i915#4098]: https://gitlab.freedesktop.org/drm/intel/issues/4098
  [i915#4103]: https://gitlab.freedesktop.org/drm/intel/issues/4103
  [i915#4215]: https://gitlab.freedesktop.org/drm/intel/issues/4215
  [i915#4270]: https://gitlab.freedesktop.org/drm/intel/issues/4270
  [i915#433]: https://gitlab.freedesktop.org/drm/intel/issues/433
  [i915#4349]: https://gitlab.freedesktop.org/drm/intel/issues/4349
  [i915#4391]: https://gitlab.freedesktop.org/drm/intel/issues/4391
  [i915#4538]: https://gitlab.freedesktop.org/drm/intel/issues/4538
  [i915#4613]: https://gitlab.freedesktop.org/drm/intel/issues/4613
  [i915#4812]: https://gitlab.freedesktop.org/drm/intel/issues/4812
  [i915#4833]: https://gitlab.freedesktop.org/drm/intel/issues/4833
  [i915#4852]: https://gitlab.freedesktop.org/drm/intel/issues/4852
  [i915#4854]: https://gitlab.freedesktop.org/drm/intel/issues/4854
  [i915#4860]: https://gitlab.freedesktop.org/drm/intel/issues/4860
  [i915#5161]: https://gitlab.freedesktop.org/drm/intel/issues/5161
  [i915#5235]: https://gitlab.freedesktop.org/drm/intel/issues/5235
  [i915#5286]: https://gitlab.freedesktop.org/drm/intel/issues/5286
  [i915#5288]: https://gitlab.freedesktop.org/drm/intel/issues/5288
  [i915#5289]: https://gitlab.freedesktop.org/drm/intel/issues/5289
  [i915#5325]: https://gitlab.freedesktop.org/drm/intel/issues/5325
  [i915#533]: https://gitlab.freedesktop.org/drm/intel/issues/533
  [i915#5439]: https://gitlab.freedesktop.org/drm/intel/issues/5439
  [i915#5461]: https://gitlab.freedesktop.org/drm/intel/issues/5461
  [i915#5563]: https://gitlab.freedesktop.org/drm/intel/issues/5563
  [i915#5566]: https://gitlab.freedesktop.org/drm/intel/issues/5566
  [i915#6095]: https://gitlab.freedesktop.org/drm/intel/issues/6095
  [i915#6248]: https://gitlab.freedesktop.org/drm/intel/issues/6248
  [i915#6252]: https://gitlab.freedesktop.org/drm/intel/issues/6252
  [i915#6258]: https://gitlab.freedesktop.org/drm/intel/issues/6258
  [i915#6259]: https://gitlab.freedesktop.org/drm/intel/issues/6259
  [i915#6355]: https://gitlab.freedesktop.org/drm/intel/issues/6355
  [i915#6433]: https://gitlab.freedesktop.org/drm/intel/issues/6433
  [i915#6497]: https://gitlab.freedesktop.org/drm/intel/issues/6497
  [i915#6524]: https://gitlab.freedesktop.org/drm/intel/issues/6524
  [i915#658]: https://gitlab.freedesktop.org/drm/intel/issues/658
  [i915#6621]: https://gitlab.freedesktop.org/drm/intel/issues/6621
  [i915#6768]: https://gitlab.freedesktop.org/drm/intel/issues/6768
  [i915#6944]: https://gitlab.freedesktop.org/drm/intel/issues/6944
  [i915#6953]: https://gitlab.freedesktop.org/drm/intel/issues/6953
  [i915#7116]: https://gitlab.freedesktop.org/drm/intel/issues/7116
  [i915#7118]: https://gitlab.freedesktop.org/drm/intel/issues/7118
  [i915#7128]: https://gitlab.freedesktop.org/drm/intel/issues/7128
  [i915#7276]: https://gitlab.freedesktop.org/drm/intel/issues/7276
  [i915#7294]: https://gitlab.freedesktop.org/drm/intel/issues/7294
  [i915#7651]: https://gitlab.freedesktop.org/drm/intel/issues/7651
  [i915#7701]: https://gitlab.freedesktop.org/drm/intel/issues/7701
  [i915#7711]: https://gitlab.freedesktop.org/drm/intel/issues/7711
  [i915#7828]: https://gitlab.freedesktop.org/drm/intel/issues/7828
  [i915#7949]: https://gitlab.freedesktop.org/drm/intel/issues/7949
  [i915#7957]: https://gitlab.freedesktop.org/drm/intel/issues/7957
  [i915#7975]: https://gitlab.freedesktop.org/drm/intel/issues/7975
  [i915#8152]: https://gitlab.freedesktop.org/drm/intel/issues/8152
  [i915#8154]: https://gitlab.freedesktop.org/drm/intel/issues/8154
  [i915#8228]: https://gitlab.freedesktop.org/drm/intel/issues/8228
  [i915#8282]: https://gitlab.freedesktop.org/drm/intel/issues/8282


Build changes
-------------

  * Linux: CI_DRM_12867 -> Patchwork_115003v3

  CI-20190529: 20190529
  CI_DRM_12867: 67d4276f8342780b8eaa6e9f5c15d979254a5675 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_7196: 9b8c5dbe8cd82163ee198c43b81222d2b9b75fd4 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_115003v3: 67d4276f8342780b8eaa6e9f5c15d979254a5675 @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115003v3/index.html

[-- Attachment #2: Type: text/html, Size: 14539 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-16  3:59 ` [Intel-gfx] " Ashutosh Dixit
@ 2023-03-24 18:15   ` Belgaumkar, Vinay
  -1 siblings, 0 replies; 25+ messages in thread
From: Belgaumkar, Vinay @ 2023-03-24 18:15 UTC (permalink / raw)
  To: Ashutosh Dixit, intel-gfx
  Cc: John Harrison, Badal Nilawar, dri-devel, Rodrigo Vivi


On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> On dGfx, the PL1 power limit being enabled and set to a low value results
> in a low GPU operating freq. It also negates the freq raise operation which
> is done before GuC firmware load. As a result GuC firmware load can time
> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> limit was enabled and set to a low value). Therefore disable the PL1 power
> limit when allowed by HW when loading GuC firmware.
v3 label missing in subject.
>
> v2:
>   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
>   - Add hwm_power_max_restore to error return code path
>
> v3 (Jani N):
>   - Add/remove explanatory comments
>   - Function renames
>   - Type corrections
>   - Locking annotation
>
> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
>   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
>   3 files changed, 55 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> index 4ccb4be4c9cba..aa8e35a5636a0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> @@ -18,6 +18,7 @@
>   #include "intel_uc.h"
>   
>   #include "i915_drv.h"
> +#include "i915_hwmon.h"
>   
>   static const struct intel_uc_ops uc_ops_off;
>   static const struct intel_uc_ops uc_ops_on;
> @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
>   	struct intel_guc *guc = &uc->guc;
>   	struct intel_huc *huc = &uc->huc;
>   	int ret, attempts;
> +	bool pl1en;

Init to 'false' here


>   
>   	GEM_BUG_ON(!intel_uc_supports_guc(uc));
>   	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
>   	else
>   		attempts = 1;
>   
> +	/* Disable a potentially low PL1 power limit to allow freq to be raised */
> +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> +
>   	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
>   
>   	while (attempts--) {
> @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>   		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>   	}
>   
> +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> +
>   	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
>   	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
>   
> @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>   	/* Return GT back to RPn */
>   	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>   
> +	i915_hwmon_power_max_restore(gt->i915, pl1en);

if (pl1en)

     i915_hwmon_power_max_enable().

> +
>   	__uc_sanitize(uc);
>   
>   	if (!ret) {
> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> index ee63a8fd88fc1..769b5bda4d53f 100644
> --- a/drivers/gpu/drm/i915/i915_hwmon.c
> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
>   	}
>   }
>   
> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old)
Shouldn't we call this i915_hwmon_package_pl1_disable()?
> +	__acquires(i915->hwmon->hwmon_lock)
> +{
> +	struct i915_hwmon *hwmon = i915->hwmon;
> +	intel_wakeref_t wakeref;
> +	u32 r;
> +
> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> +		return;
> +
> +	/* Take mutex to prevent concurrent hwm_power_max_write */
> +	mutex_lock(&hwmon->hwmon_lock);
> +
> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> +				     hwmon->rg.pkg_rapl_limit,
> +				     PKG_PWR_LIM_1_EN, 0);
Most of this code (lock and rmw parts) is already inside static void
hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> +
> +	*old = !!(r & PKG_PWR_LIM_1_EN);
> +}
> +
> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> +	__releases(i915->hwmon->hwmon_lock)
We can just call this i915_hwmon_power_max_enable() and call whenever 
the old value was actually enabled. That way, we have proper mirror 
functions.
> +{
> +	struct i915_hwmon *hwmon = i915->hwmon;
> +	intel_wakeref_t wakeref;
> +
> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> +		return;
> +
> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> +		intel_uncore_rmw(hwmon->ddat.uncore,
> +				 hwmon->rg.pkg_rapl_limit,
> +				 PKG_PWR_LIM_1_EN,
> +				 old ? PKG_PWR_LIM_1_EN : 0);

3rd param should be 0 here, else we will end up clearing other bits.

Thanks,

Vinay.

> +
> +	mutex_unlock(&hwmon->hwmon_lock);
> +}
> +
>   static umode_t
>   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
>   {
> diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> index 7ca9cf2c34c96..0fcb7de844061 100644
> --- a/drivers/gpu/drm/i915/i915_hwmon.h
> +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> @@ -7,14 +7,21 @@
>   #ifndef __I915_HWMON_H__
>   #define __I915_HWMON_H__
>   
> +#include <linux/types.h>
> +
>   struct drm_i915_private;
> +struct intel_gt;
>   
>   #if IS_REACHABLE(CONFIG_HWMON)
>   void i915_hwmon_register(struct drm_i915_private *i915);
>   void i915_hwmon_unregister(struct drm_i915_private *i915);
> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
>   #else
>   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
>   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
>   #endif
>   
>   #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-24 18:15   ` Belgaumkar, Vinay
  0 siblings, 0 replies; 25+ messages in thread
From: Belgaumkar, Vinay @ 2023-03-24 18:15 UTC (permalink / raw)
  To: Ashutosh Dixit, intel-gfx; +Cc: dri-devel, Rodrigo Vivi


On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> On dGfx, the PL1 power limit being enabled and set to a low value results
> in a low GPU operating freq. It also negates the freq raise operation which
> is done before GuC firmware load. As a result GuC firmware load can time
> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> limit was enabled and set to a low value). Therefore disable the PL1 power
> limit when allowed by HW when loading GuC firmware.
v3 label missing in subject.
>
> v2:
>   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
>   - Add hwm_power_max_restore to error return code path
>
> v3 (Jani N):
>   - Add/remove explanatory comments
>   - Function renames
>   - Type corrections
>   - Locking annotation
>
> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
>   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
>   3 files changed, 55 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> index 4ccb4be4c9cba..aa8e35a5636a0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> @@ -18,6 +18,7 @@
>   #include "intel_uc.h"
>   
>   #include "i915_drv.h"
> +#include "i915_hwmon.h"
>   
>   static const struct intel_uc_ops uc_ops_off;
>   static const struct intel_uc_ops uc_ops_on;
> @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
>   	struct intel_guc *guc = &uc->guc;
>   	struct intel_huc *huc = &uc->huc;
>   	int ret, attempts;
> +	bool pl1en;

Init to 'false' here


>   
>   	GEM_BUG_ON(!intel_uc_supports_guc(uc));
>   	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
>   	else
>   		attempts = 1;
>   
> +	/* Disable a potentially low PL1 power limit to allow freq to be raised */
> +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> +
>   	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
>   
>   	while (attempts--) {
> @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>   		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>   	}
>   
> +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> +
>   	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
>   	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
>   
> @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>   	/* Return GT back to RPn */
>   	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>   
> +	i915_hwmon_power_max_restore(gt->i915, pl1en);

if (pl1en)

     i915_hwmon_power_max_enable().

> +
>   	__uc_sanitize(uc);
>   
>   	if (!ret) {
> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> index ee63a8fd88fc1..769b5bda4d53f 100644
> --- a/drivers/gpu/drm/i915/i915_hwmon.c
> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
>   	}
>   }
>   
> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old)
Shouldn't we call this i915_hwmon_package_pl1_disable()?
> +	__acquires(i915->hwmon->hwmon_lock)
> +{
> +	struct i915_hwmon *hwmon = i915->hwmon;
> +	intel_wakeref_t wakeref;
> +	u32 r;
> +
> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> +		return;
> +
> +	/* Take mutex to prevent concurrent hwm_power_max_write */
> +	mutex_lock(&hwmon->hwmon_lock);
> +
> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> +				     hwmon->rg.pkg_rapl_limit,
> +				     PKG_PWR_LIM_1_EN, 0);
Most of this code (lock and rmw parts) is already inside static void
hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> +
> +	*old = !!(r & PKG_PWR_LIM_1_EN);
> +}
> +
> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> +	__releases(i915->hwmon->hwmon_lock)
We can just call this i915_hwmon_power_max_enable() and call whenever 
the old value was actually enabled. That way, we have proper mirror 
functions.
> +{
> +	struct i915_hwmon *hwmon = i915->hwmon;
> +	intel_wakeref_t wakeref;
> +
> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> +		return;
> +
> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> +		intel_uncore_rmw(hwmon->ddat.uncore,
> +				 hwmon->rg.pkg_rapl_limit,
> +				 PKG_PWR_LIM_1_EN,
> +				 old ? PKG_PWR_LIM_1_EN : 0);

3rd param should be 0 here, else we will end up clearing other bits.

Thanks,

Vinay.

> +
> +	mutex_unlock(&hwmon->hwmon_lock);
> +}
> +
>   static umode_t
>   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
>   {
> diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> index 7ca9cf2c34c96..0fcb7de844061 100644
> --- a/drivers/gpu/drm/i915/i915_hwmon.h
> +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> @@ -7,14 +7,21 @@
>   #ifndef __I915_HWMON_H__
>   #define __I915_HWMON_H__
>   
> +#include <linux/types.h>
> +
>   struct drm_i915_private;
> +struct intel_gt;
>   
>   #if IS_REACHABLE(CONFIG_HWMON)
>   void i915_hwmon_register(struct drm_i915_private *i915);
>   void i915_hwmon_unregister(struct drm_i915_private *i915);
> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
>   #else
>   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
>   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
>   #endif
>   
>   #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-24 18:15   ` [Intel-gfx] " Belgaumkar, Vinay
@ 2023-03-24 23:31     ` Dixit, Ashutosh
  -1 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-03-24 23:31 UTC (permalink / raw)
  To: Belgaumkar, Vinay
  Cc: intel-gfx, dri-devel, Badal Nilawar, Rodrigo Vivi, John Harrison

On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
>

Hi Vinay,

Thanks for the review. Comments inline below.

> On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > On dGfx, the PL1 power limit being enabled and set to a low value results
> > in a low GPU operating freq. It also negates the freq raise operation which
> > is done before GuC firmware load. As a result GuC firmware load can time
> > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > limit was enabled and set to a low value). Therefore disable the PL1 power
> > limit when allowed by HW when loading GuC firmware.
> v3 label missing in subject.
> >
> > v2:
> >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> >   - Add hwm_power_max_restore to error return code path
> >
> > v3 (Jani N):
> >   - Add/remove explanatory comments
> >   - Function renames
> >   - Type corrections
> >   - Locking annotation
> >
> > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> >   3 files changed, 55 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > @@ -18,6 +18,7 @@
> >   #include "intel_uc.h"
> >     #include "i915_drv.h"
> > +#include "i915_hwmon.h"
> >     static const struct intel_uc_ops uc_ops_off;
> >   static const struct intel_uc_ops uc_ops_on;
> > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> >	struct intel_guc *guc = &uc->guc;
> >	struct intel_huc *huc = &uc->huc;
> >	int ret, attempts;
> > +	bool pl1en;
>
> Init to 'false' here

See next comment.

>
>
> >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> >	else
> >		attempts = 1;
> >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > raised */
> > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > +
> >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> >		while (attempts--) {
> > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >	}
> >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > +
> >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >	/* Return GT back to RPn */
> >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
>
> if (pl1en)
>
>     i915_hwmon_power_max_enable().

IMO it's better not to have checks in the main __uc_init_hw() function (if
we do this we'll need to add 2 checks in __uc_init_hw()). If you really
want we could do something like this inside
i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
am not making any changes.

(I can send a patch with the changes if you want to take a look but IMO it
will add more logic/code but without real benefits (it will save a rmw if
the limit was already disabled, but IMO this code is called so infrequently
(only during GuC resets) as to not have any significant impact)).

>
> > +
> >	__uc_sanitize(uc);
> >		if (!ret) {
> > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > index ee63a8fd88fc1..769b5bda4d53f 100644
> > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> >	}
> >   }
> >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > *old)
> Shouldn't we call this i915_hwmon_package_pl1_disable()?

I did think of using "pl1" in the function name but then decided to retain
"power_max" because other hwmon functions for PL1 limit also use
"power_max" (hwm_power_max_read/hwm_power_max_write) and currently
"hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
show that all these functions deal with the PL1 power limit.

There is a comment in __uc_init_hw() explaining "power_max" means the PL1
power limit.

> > +	__acquires(i915->hwmon->hwmon_lock)
> > +{
> > +	struct i915_hwmon *hwmon = i915->hwmon;
> > +	intel_wakeref_t wakeref;
> > +	u32 r;
> > +
> > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > +		return;
> > +
> > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > +	mutex_lock(&hwmon->hwmon_lock);
> > +
> > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > +				     hwmon->rg.pkg_rapl_limit,
> > +				     PKG_PWR_LIM_1_EN, 0);
> Most of this code (lock and rmw parts) is already inside static void
> hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?

This was the case in v1 of the patch:

https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1

But now this cannot be done because if you notice we acquire the mutex in
i915_hwmon_power_max_disable() and release the mutex in
i915_hwmon_power_max_restore().

I explained the reason why the mutex is handled this way in my reply
to Jani Nikula here:

https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2

Quoting below:

```
> > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
>
> Not too happy about that... any better ideas?

Afais, taking the mutex is the only fully correct solution (when we disable
the power limit, userspace can go re-enable it). Examples of partly
incorrect solutions (which don't take the mutex) include:

a. Don't take the mutex, don't do anything, ignore any changes to the value
   if it has changed during GuC reset/fw load (just overwrite the changed
   value). Con: changed value is lost.

b. Detect if the value has changed (the limit has been re-enabled) after we
   have disabled the limit and in that case skip restoring the value. But
   then someone can say why do we allow enabling the PL1 limit since we
   want to disable it.

Both these are very unlikely scenarios so they might work. But I would
first like to explore if holding a mutex across GuC reset is problematic
since that is /the/ correct solution. But if anyone comes up with a reason
why that cannot be done we can look at these other not completely correct
options.
```

> > +
> > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > +}
> > +
> > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > +	__releases(i915->hwmon->hwmon_lock)
> We can just call this i915_hwmon_power_max_enable() and call whenever the
> old value was actually enabled. That way, we have proper mirror functions.

As I explained that would mean adding two checks in the main __uc_init_hw()
function which I am trying to avoid. So we have disable/restore pair.

> > +{
> > +	struct i915_hwmon *hwmon = i915->hwmon;
> > +	intel_wakeref_t wakeref;
> > +
> > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > +		return;
> > +
> > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > +				 hwmon->rg.pkg_rapl_limit,
> > +				 PKG_PWR_LIM_1_EN,
> > +				 old ? PKG_PWR_LIM_1_EN : 0);
>
> 3rd param should be 0 here, else we will end up clearing other bits.

No, see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
the code here is correct. intel_uncore_rmw() does:

        val = (old & ~clear) | set;

So for now I am not making any changes, if you feel strongly about
something one way or another let me know. Anyway these comments should help
you understand the patch better so take a look and we can go from there.

Thanks.
--
Ashutosh

> > +
> > +	mutex_unlock(&hwmon->hwmon_lock);
> > +}
> > +
> >   static umode_t
> >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> >   {
> > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > index 7ca9cf2c34c96..0fcb7de844061 100644
> > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > @@ -7,14 +7,21 @@
> >   #ifndef __I915_HWMON_H__
> >   #define __I915_HWMON_H__
> >   +#include <linux/types.h>
> > +
> >   struct drm_i915_private;
> > +struct intel_gt;
> >     #if IS_REACHABLE(CONFIG_HWMON)
> >   void i915_hwmon_register(struct drm_i915_private *i915);
> >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> >   #else
> >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> >   #endif
> >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-24 23:31     ` Dixit, Ashutosh
  0 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-03-24 23:31 UTC (permalink / raw)
  To: Belgaumkar, Vinay; +Cc: intel-gfx, dri-devel, Rodrigo Vivi

On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
>

Hi Vinay,

Thanks for the review. Comments inline below.

> On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > On dGfx, the PL1 power limit being enabled and set to a low value results
> > in a low GPU operating freq. It also negates the freq raise operation which
> > is done before GuC firmware load. As a result GuC firmware load can time
> > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > limit was enabled and set to a low value). Therefore disable the PL1 power
> > limit when allowed by HW when loading GuC firmware.
> v3 label missing in subject.
> >
> > v2:
> >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> >   - Add hwm_power_max_restore to error return code path
> >
> > v3 (Jani N):
> >   - Add/remove explanatory comments
> >   - Function renames
> >   - Type corrections
> >   - Locking annotation
> >
> > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> >   3 files changed, 55 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > @@ -18,6 +18,7 @@
> >   #include "intel_uc.h"
> >     #include "i915_drv.h"
> > +#include "i915_hwmon.h"
> >     static const struct intel_uc_ops uc_ops_off;
> >   static const struct intel_uc_ops uc_ops_on;
> > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> >	struct intel_guc *guc = &uc->guc;
> >	struct intel_huc *huc = &uc->huc;
> >	int ret, attempts;
> > +	bool pl1en;
>
> Init to 'false' here

See next comment.

>
>
> >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> >	else
> >		attempts = 1;
> >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > raised */
> > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > +
> >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> >		while (attempts--) {
> > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >	}
> >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > +
> >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >	/* Return GT back to RPn */
> >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
>
> if (pl1en)
>
>     i915_hwmon_power_max_enable().

IMO it's better not to have checks in the main __uc_init_hw() function (if
we do this we'll need to add 2 checks in __uc_init_hw()). If you really
want we could do something like this inside
i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
am not making any changes.

(I can send a patch with the changes if you want to take a look but IMO it
will add more logic/code but without real benefits (it will save a rmw if
the limit was already disabled, but IMO this code is called so infrequently
(only during GuC resets) as to not have any significant impact)).

>
> > +
> >	__uc_sanitize(uc);
> >		if (!ret) {
> > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > index ee63a8fd88fc1..769b5bda4d53f 100644
> > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> >	}
> >   }
> >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > *old)
> Shouldn't we call this i915_hwmon_package_pl1_disable()?

I did think of using "pl1" in the function name but then decided to retain
"power_max" because other hwmon functions for PL1 limit also use
"power_max" (hwm_power_max_read/hwm_power_max_write) and currently
"hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
show that all these functions deal with the PL1 power limit.

There is a comment in __uc_init_hw() explaining "power_max" means the PL1
power limit.

> > +	__acquires(i915->hwmon->hwmon_lock)
> > +{
> > +	struct i915_hwmon *hwmon = i915->hwmon;
> > +	intel_wakeref_t wakeref;
> > +	u32 r;
> > +
> > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > +		return;
> > +
> > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > +	mutex_lock(&hwmon->hwmon_lock);
> > +
> > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > +				     hwmon->rg.pkg_rapl_limit,
> > +				     PKG_PWR_LIM_1_EN, 0);
> Most of this code (lock and rmw parts) is already inside static void
> hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?

This was the case in v1 of the patch:

https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1

But now this cannot be done because if you notice we acquire the mutex in
i915_hwmon_power_max_disable() and release the mutex in
i915_hwmon_power_max_restore().

I explained the reason why the mutex is handled this way in my reply
to Jani Nikula here:

https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2

Quoting below:

```
> > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
>
> Not too happy about that... any better ideas?

Afais, taking the mutex is the only fully correct solution (when we disable
the power limit, userspace can go re-enable it). Examples of partly
incorrect solutions (which don't take the mutex) include:

a. Don't take the mutex, don't do anything, ignore any changes to the value
   if it has changed during GuC reset/fw load (just overwrite the changed
   value). Con: changed value is lost.

b. Detect if the value has changed (the limit has been re-enabled) after we
   have disabled the limit and in that case skip restoring the value. But
   then someone can say why do we allow enabling the PL1 limit since we
   want to disable it.

Both these are very unlikely scenarios so they might work. But I would
first like to explore if holding a mutex across GuC reset is problematic
since that is /the/ correct solution. But if anyone comes up with a reason
why that cannot be done we can look at these other not completely correct
options.
```

> > +
> > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > +}
> > +
> > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > +	__releases(i915->hwmon->hwmon_lock)
> We can just call this i915_hwmon_power_max_enable() and call whenever the
> old value was actually enabled. That way, we have proper mirror functions.

As I explained that would mean adding two checks in the main __uc_init_hw()
function which I am trying to avoid. So we have disable/restore pair.

> > +{
> > +	struct i915_hwmon *hwmon = i915->hwmon;
> > +	intel_wakeref_t wakeref;
> > +
> > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > +		return;
> > +
> > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > +				 hwmon->rg.pkg_rapl_limit,
> > +				 PKG_PWR_LIM_1_EN,
> > +				 old ? PKG_PWR_LIM_1_EN : 0);
>
> 3rd param should be 0 here, else we will end up clearing other bits.

No, see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
the code here is correct. intel_uncore_rmw() does:

        val = (old & ~clear) | set;

So for now I am not making any changes, if you feel strongly about
something one way or another let me know. Anyway these comments should help
you understand the patch better so take a look and we can go from there.

Thanks.
--
Ashutosh

> > +
> > +	mutex_unlock(&hwmon->hwmon_lock);
> > +}
> > +
> >   static umode_t
> >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> >   {
> > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > index 7ca9cf2c34c96..0fcb7de844061 100644
> > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > @@ -7,14 +7,21 @@
> >   #ifndef __I915_HWMON_H__
> >   #define __I915_HWMON_H__
> >   +#include <linux/types.h>
> > +
> >   struct drm_i915_private;
> > +struct intel_gt;
> >     #if IS_REACHABLE(CONFIG_HWMON)
> >   void i915_hwmon_register(struct drm_i915_private *i915);
> >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> >   #else
> >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> >   #endif
> >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-24 23:31     ` [Intel-gfx] " Dixit, Ashutosh
@ 2023-03-25  0:06       ` Belgaumkar, Vinay
  -1 siblings, 0 replies; 25+ messages in thread
From: Belgaumkar, Vinay @ 2023-03-25  0:06 UTC (permalink / raw)
  To: Dixit, Ashutosh
  Cc: intel-gfx, dri-devel, Badal Nilawar, Rodrigo Vivi, John Harrison


On 3/24/2023 4:31 PM, Dixit, Ashutosh wrote:
> On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> Hi Vinay,
>
> Thanks for the review. Comments inline below.
Sorry about asking the same questions all over again :) Didn't look at 
previous versions.
>
>> On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
>>> On dGfx, the PL1 power limit being enabled and set to a low value results
>>> in a low GPU operating freq. It also negates the freq raise operation which
>>> is done before GuC firmware load. As a result GuC firmware load can time
>>> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
>>> limit was enabled and set to a low value). Therefore disable the PL1 power
>>> limit when allowed by HW when loading GuC firmware.
>> v3 label missing in subject.
>>> v2:
>>>    - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
>>>    - Add hwm_power_max_restore to error return code path
>>>
>>> v3 (Jani N):
>>>    - Add/remove explanatory comments
>>>    - Function renames
>>>    - Type corrections
>>>    - Locking annotation
>>>
>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
>>> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
>>>    drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
>>>    drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
>>>    3 files changed, 55 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> index 4ccb4be4c9cba..aa8e35a5636a0 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> @@ -18,6 +18,7 @@
>>>    #include "intel_uc.h"
>>>      #include "i915_drv.h"
>>> +#include "i915_hwmon.h"
>>>      static const struct intel_uc_ops uc_ops_off;
>>>    static const struct intel_uc_ops uc_ops_on;
>>> @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
>>> 	struct intel_guc *guc = &uc->guc;
>>> 	struct intel_huc *huc = &uc->huc;
>>> 	int ret, attempts;
>>> +	bool pl1en;
>> Init to 'false' here
> See next comment.
>
>>
>>> 		GEM_BUG_ON(!intel_uc_supports_guc(uc));
>>> 	GEM_BUG_ON(!intel_uc_wants_guc(uc));
>>> @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
>>> 	else
>>> 		attempts = 1;
>>>    +	/* Disable a potentially low PL1 power limit to allow freq to be
>>> raised */
>>> +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
>>> +
>>> 	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
>>> 		while (attempts--) {
>>> @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>>> 		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>>> 	}
>>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
>>> +
>>> 	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
>>> 	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
>>>    @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>>> 	/* Return GT back to RPn */
>>> 	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
>> if (pl1en)
>>
>>      i915_hwmon_power_max_enable().
> IMO it's better not to have checks in the main __uc_init_hw() function (if
> we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> want we could do something like this inside
> i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> am not making any changes.
ok.
>
> (I can send a patch with the changes if you want to take a look but IMO it
> will add more logic/code but without real benefits (it will save a rmw if
> the limit was already disabled, but IMO this code is called so infrequently
> (only during GuC resets) as to not have any significant impact)).
>
>>> +
>>> 	__uc_sanitize(uc);
>>> 		if (!ret) {
>>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
>>> index ee63a8fd88fc1..769b5bda4d53f 100644
>>> --- a/drivers/gpu/drm/i915/i915_hwmon.c
>>> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
>>> @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
>>> 	}
>>>    }
>>>    +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
>>> *old)
>> Shouldn't we call this i915_hwmon_package_pl1_disable()?
> I did think of using "pl1" in the function name but then decided to retain
> "power_max" because other hwmon functions for PL1 limit also use
> "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> show that all these functions deal with the PL1 power limit.
>
> There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> power limit.
ok.
>
>>> +	__acquires(i915->hwmon->hwmon_lock)
>>> +{
>>> +	struct i915_hwmon *hwmon = i915->hwmon;
>>> +	intel_wakeref_t wakeref;
>>> +	u32 r;
>>> +
>>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
>>> +		return;
>>> +
>>> +	/* Take mutex to prevent concurrent hwm_power_max_write */
>>> +	mutex_lock(&hwmon->hwmon_lock);
>>> +
>>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
>>> +		r = intel_uncore_rmw(hwmon->ddat.uncore,
>>> +				     hwmon->rg.pkg_rapl_limit,
>>> +				     PKG_PWR_LIM_1_EN, 0);
>> Most of this code (lock and rmw parts) is already inside static void
>> hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> This was the case in v1 of the patch:
>
> https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
>
> But now this cannot be done because if you notice we acquire the mutex in
> i915_hwmon_power_max_disable() and release the mutex in
> i915_hwmon_power_max_restore().
>
> I explained the reason why the mutex is handled this way in my reply
> to Jani Nikula here:
>
> https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
>
> Quoting below:
>
> ```
>>> +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
>> Not too happy about that... any better ideas?
> Afais, taking the mutex is the only fully correct solution (when we disable
> the power limit, userspace can go re-enable it). Examples of partly
> incorrect solutions (which don't take the mutex) include:
>
> a. Don't take the mutex, don't do anything, ignore any changes to the value
>     if it has changed during GuC reset/fw load (just overwrite the changed
>     value). Con: changed value is lost.
>
> b. Detect if the value has changed (the limit has been re-enabled) after we
>     have disabled the limit and in that case skip restoring the value. But
>     then someone can say why do we allow enabling the PL1 limit since we
>     want to disable it.
>
> Both these are very unlikely scenarios so they might work. But I would
> first like to explore if holding a mutex across GuC reset is problematic
> since that is /the/ correct solution. But if anyone comes up with a reason
> why that cannot be done we can look at these other not completely correct
> options.

Well, one reason is that this is adding a lot of duplicate/non-reusable 
code needlessly. If it gets re-used elsewhere, that could lead to some 
weird situations where the lock could be held for an extended period of 
time and introduce dependencies. Also, how/why would the user modify 
this PL1 during guc load? The sysfs interfaces are not even ready at 
this point? Even if we consider this during a resume, the terminal will 
not be available to the user.

Thanks,

Vinay.

> ```
>
>>> +
>>> +	*old = !!(r & PKG_PWR_LIM_1_EN);
>>> +}
>>> +
>>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
>>> +	__releases(i915->hwmon->hwmon_lock)
>> We can just call this i915_hwmon_power_max_enable() and call whenever the
>> old value was actually enabled. That way, we have proper mirror functions.
> As I explained that would mean adding two checks in the main __uc_init_hw()
> function which I am trying to avoid. So we have disable/restore pair.
>
>>> +{
>>> +	struct i915_hwmon *hwmon = i915->hwmon;
>>> +	intel_wakeref_t wakeref;
>>> +
>>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
>>> +		return;
>>> +
>>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
>>> +		intel_uncore_rmw(hwmon->ddat.uncore,
>>> +				 hwmon->rg.pkg_rapl_limit,
>>> +				 PKG_PWR_LIM_1_EN,
>>> +				 old ? PKG_PWR_LIM_1_EN : 0);
>> 3rd param should be 0 here, else we will end up clearing other bits.
> No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> the code here is correct. intel_uncore_rmw() does:
>
>          val = (old & ~clear) | set;
Ok, just confusing, since you are also setting it with the 4th param.
>
> So for now I am not making any changes, if you feel strongly about
> something one way or another let me know. Anyway these comments should help
> you understand the patch better so take a look and we can go from there.
>
> Thanks.
> --
> Ashutosh
>
>>> +
>>> +	mutex_unlock(&hwmon->hwmon_lock);
>>> +}
>>> +
>>>    static umode_t
>>>    hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
>>>    {
>>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
>>> index 7ca9cf2c34c96..0fcb7de844061 100644
>>> --- a/drivers/gpu/drm/i915/i915_hwmon.h
>>> +++ b/drivers/gpu/drm/i915/i915_hwmon.h
>>> @@ -7,14 +7,21 @@
>>>    #ifndef __I915_HWMON_H__
>>>    #define __I915_HWMON_H__
>>>    +#include <linux/types.h>
>>> +
>>>    struct drm_i915_private;
>>> +struct intel_gt;
>>>      #if IS_REACHABLE(CONFIG_HWMON)
>>>    void i915_hwmon_register(struct drm_i915_private *i915);
>>>    void i915_hwmon_unregister(struct drm_i915_private *i915);
>>> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
>>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
>>>    #else
>>>    static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
>>>    static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
>>> +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
>>> +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
>>>    #endif
>>>      #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-25  0:06       ` Belgaumkar, Vinay
  0 siblings, 0 replies; 25+ messages in thread
From: Belgaumkar, Vinay @ 2023-03-25  0:06 UTC (permalink / raw)
  To: Dixit, Ashutosh; +Cc: intel-gfx, dri-devel, Rodrigo Vivi


On 3/24/2023 4:31 PM, Dixit, Ashutosh wrote:
> On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> Hi Vinay,
>
> Thanks for the review. Comments inline below.
Sorry about asking the same questions all over again :) Didn't look at 
previous versions.
>
>> On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
>>> On dGfx, the PL1 power limit being enabled and set to a low value results
>>> in a low GPU operating freq. It also negates the freq raise operation which
>>> is done before GuC firmware load. As a result GuC firmware load can time
>>> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
>>> limit was enabled and set to a low value). Therefore disable the PL1 power
>>> limit when allowed by HW when loading GuC firmware.
>> v3 label missing in subject.
>>> v2:
>>>    - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
>>>    - Add hwm_power_max_restore to error return code path
>>>
>>> v3 (Jani N):
>>>    - Add/remove explanatory comments
>>>    - Function renames
>>>    - Type corrections
>>>    - Locking annotation
>>>
>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
>>> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
>>>    drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
>>>    drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
>>>    3 files changed, 55 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> index 4ccb4be4c9cba..aa8e35a5636a0 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> @@ -18,6 +18,7 @@
>>>    #include "intel_uc.h"
>>>      #include "i915_drv.h"
>>> +#include "i915_hwmon.h"
>>>      static const struct intel_uc_ops uc_ops_off;
>>>    static const struct intel_uc_ops uc_ops_on;
>>> @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
>>> 	struct intel_guc *guc = &uc->guc;
>>> 	struct intel_huc *huc = &uc->huc;
>>> 	int ret, attempts;
>>> +	bool pl1en;
>> Init to 'false' here
> See next comment.
>
>>
>>> 		GEM_BUG_ON(!intel_uc_supports_guc(uc));
>>> 	GEM_BUG_ON(!intel_uc_wants_guc(uc));
>>> @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
>>> 	else
>>> 		attempts = 1;
>>>    +	/* Disable a potentially low PL1 power limit to allow freq to be
>>> raised */
>>> +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
>>> +
>>> 	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
>>> 		while (attempts--) {
>>> @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>>> 		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>>> 	}
>>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
>>> +
>>> 	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
>>> 	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
>>>    @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>>> 	/* Return GT back to RPn */
>>> 	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
>> if (pl1en)
>>
>>      i915_hwmon_power_max_enable().
> IMO it's better not to have checks in the main __uc_init_hw() function (if
> we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> want we could do something like this inside
> i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> am not making any changes.
ok.
>
> (I can send a patch with the changes if you want to take a look but IMO it
> will add more logic/code but without real benefits (it will save a rmw if
> the limit was already disabled, but IMO this code is called so infrequently
> (only during GuC resets) as to not have any significant impact)).
>
>>> +
>>> 	__uc_sanitize(uc);
>>> 		if (!ret) {
>>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
>>> index ee63a8fd88fc1..769b5bda4d53f 100644
>>> --- a/drivers/gpu/drm/i915/i915_hwmon.c
>>> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
>>> @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
>>> 	}
>>>    }
>>>    +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
>>> *old)
>> Shouldn't we call this i915_hwmon_package_pl1_disable()?
> I did think of using "pl1" in the function name but then decided to retain
> "power_max" because other hwmon functions for PL1 limit also use
> "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> show that all these functions deal with the PL1 power limit.
>
> There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> power limit.
ok.
>
>>> +	__acquires(i915->hwmon->hwmon_lock)
>>> +{
>>> +	struct i915_hwmon *hwmon = i915->hwmon;
>>> +	intel_wakeref_t wakeref;
>>> +	u32 r;
>>> +
>>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
>>> +		return;
>>> +
>>> +	/* Take mutex to prevent concurrent hwm_power_max_write */
>>> +	mutex_lock(&hwmon->hwmon_lock);
>>> +
>>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
>>> +		r = intel_uncore_rmw(hwmon->ddat.uncore,
>>> +				     hwmon->rg.pkg_rapl_limit,
>>> +				     PKG_PWR_LIM_1_EN, 0);
>> Most of this code (lock and rmw parts) is already inside static void
>> hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> This was the case in v1 of the patch:
>
> https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
>
> But now this cannot be done because if you notice we acquire the mutex in
> i915_hwmon_power_max_disable() and release the mutex in
> i915_hwmon_power_max_restore().
>
> I explained the reason why the mutex is handled this way in my reply
> to Jani Nikula here:
>
> https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
>
> Quoting below:
>
> ```
>>> +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
>> Not too happy about that... any better ideas?
> Afais, taking the mutex is the only fully correct solution (when we disable
> the power limit, userspace can go re-enable it). Examples of partly
> incorrect solutions (which don't take the mutex) include:
>
> a. Don't take the mutex, don't do anything, ignore any changes to the value
>     if it has changed during GuC reset/fw load (just overwrite the changed
>     value). Con: changed value is lost.
>
> b. Detect if the value has changed (the limit has been re-enabled) after we
>     have disabled the limit and in that case skip restoring the value. But
>     then someone can say why do we allow enabling the PL1 limit since we
>     want to disable it.
>
> Both these are very unlikely scenarios so they might work. But I would
> first like to explore if holding a mutex across GuC reset is problematic
> since that is /the/ correct solution. But if anyone comes up with a reason
> why that cannot be done we can look at these other not completely correct
> options.

Well, one reason is that this is adding a lot of duplicate/non-reusable 
code needlessly. If it gets re-used elsewhere, that could lead to some 
weird situations where the lock could be held for an extended period of 
time and introduce dependencies. Also, how/why would the user modify 
this PL1 during guc load? The sysfs interfaces are not even ready at 
this point? Even if we consider this during a resume, the terminal will 
not be available to the user.

Thanks,

Vinay.

> ```
>
>>> +
>>> +	*old = !!(r & PKG_PWR_LIM_1_EN);
>>> +}
>>> +
>>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
>>> +	__releases(i915->hwmon->hwmon_lock)
>> We can just call this i915_hwmon_power_max_enable() and call whenever the
>> old value was actually enabled. That way, we have proper mirror functions.
> As I explained that would mean adding two checks in the main __uc_init_hw()
> function which I am trying to avoid. So we have disable/restore pair.
>
>>> +{
>>> +	struct i915_hwmon *hwmon = i915->hwmon;
>>> +	intel_wakeref_t wakeref;
>>> +
>>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
>>> +		return;
>>> +
>>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
>>> +		intel_uncore_rmw(hwmon->ddat.uncore,
>>> +				 hwmon->rg.pkg_rapl_limit,
>>> +				 PKG_PWR_LIM_1_EN,
>>> +				 old ? PKG_PWR_LIM_1_EN : 0);
>> 3rd param should be 0 here, else we will end up clearing other bits.
> No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> the code here is correct. intel_uncore_rmw() does:
>
>          val = (old & ~clear) | set;
Ok, just confusing, since you are also setting it with the 4th param.
>
> So for now I am not making any changes, if you feel strongly about
> something one way or another let me know. Anyway these comments should help
> you understand the patch better so take a look and we can go from there.
>
> Thanks.
> --
> Ashutosh
>
>>> +
>>> +	mutex_unlock(&hwmon->hwmon_lock);
>>> +}
>>> +
>>>    static umode_t
>>>    hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
>>>    {
>>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
>>> index 7ca9cf2c34c96..0fcb7de844061 100644
>>> --- a/drivers/gpu/drm/i915/i915_hwmon.h
>>> +++ b/drivers/gpu/drm/i915/i915_hwmon.h
>>> @@ -7,14 +7,21 @@
>>>    #ifndef __I915_HWMON_H__
>>>    #define __I915_HWMON_H__
>>>    +#include <linux/types.h>
>>> +
>>>    struct drm_i915_private;
>>> +struct intel_gt;
>>>      #if IS_REACHABLE(CONFIG_HWMON)
>>>    void i915_hwmon_register(struct drm_i915_private *i915);
>>>    void i915_hwmon_unregister(struct drm_i915_private *i915);
>>> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
>>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
>>>    #else
>>>    static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
>>>    static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
>>> +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
>>> +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
>>>    #endif
>>>      #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-24 23:31     ` [Intel-gfx] " Dixit, Ashutosh
@ 2023-03-26 11:52       ` Rodrigo Vivi
  -1 siblings, 0 replies; 25+ messages in thread
From: Rodrigo Vivi @ 2023-03-26 11:52 UTC (permalink / raw)
  To: Dixit, Ashutosh
  Cc: Belgaumkar, Vinay, intel-gfx, Badal Nilawar, dri-devel, John Harrison

On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> >
> 
> Hi Vinay,
> 
> Thanks for the review. Comments inline below.
> 
> > On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > > On dGfx, the PL1 power limit being enabled and set to a low value results
> > > in a low GPU operating freq. It also negates the freq raise operation which
> > > is done before GuC firmware load. As a result GuC firmware load can time
> > > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > > limit was enabled and set to a low value). Therefore disable the PL1 power
> > > limit when allowed by HW when loading GuC firmware.
> > v3 label missing in subject.
> > >
> > > v2:
> > >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> > >   - Add hwm_power_max_restore to error return code path
> > >
> > > v3 (Jani N):
> > >   - Add/remove explanatory comments
> > >   - Function renames
> > >   - Type corrections
> > >   - Locking annotation
> > >
> > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > ---
> > >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> > >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> > >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> > >   3 files changed, 55 insertions(+)
> > >
> > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > @@ -18,6 +18,7 @@
> > >   #include "intel_uc.h"
> > >     #include "i915_drv.h"
> > > +#include "i915_hwmon.h"
> > >     static const struct intel_uc_ops uc_ops_off;
> > >   static const struct intel_uc_ops uc_ops_on;
> > > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> > >	struct intel_guc *guc = &uc->guc;
> > >	struct intel_huc *huc = &uc->huc;
> > >	int ret, attempts;
> > > +	bool pl1en;
> >
> > Init to 'false' here
> 
> See next comment.
> 
> >
> >
> > >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> > >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> > >	else
> > >		attempts = 1;
> > >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > > raised */
> > > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > > +
> > >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> > >		while (attempts--) {
> > > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > >	}
> > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > +
> > >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> > >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> > >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > >	/* Return GT back to RPn */
> > >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> >
> > if (pl1en)
> >
> >     i915_hwmon_power_max_enable().
> 
> IMO it's better not to have checks in the main __uc_init_hw() function (if
> we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> want we could do something like this inside
> i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> am not making any changes.
> 
> (I can send a patch with the changes if you want to take a look but IMO it
> will add more logic/code but without real benefits (it will save a rmw if
> the limit was already disabled, but IMO this code is called so infrequently
> (only during GuC resets) as to not have any significant impact)).
> 
> >
> > > +
> > >	__uc_sanitize(uc);
> > >		if (!ret) {
> > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > > index ee63a8fd88fc1..769b5bda4d53f 100644
> > > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> > >	}
> > >   }
> > >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > > *old)
> > Shouldn't we call this i915_hwmon_package_pl1_disable()?
> 
> I did think of using "pl1" in the function name but then decided to retain
> "power_max" because other hwmon functions for PL1 limit also use
> "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> show that all these functions deal with the PL1 power limit.
> 
> There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> power limit.
> 
> > > +	__acquires(i915->hwmon->hwmon_lock)
> > > +{
> > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > +	intel_wakeref_t wakeref;
> > > +	u32 r;
> > > +
> > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > +		return;
> > > +
> > > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > > +	mutex_lock(&hwmon->hwmon_lock);
> > > +
> > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > > +				     hwmon->rg.pkg_rapl_limit,
> > > +				     PKG_PWR_LIM_1_EN, 0);
> > Most of this code (lock and rmw parts) is already inside static void
> > hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> 
> This was the case in v1 of the patch:
> 
> https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> 
> But now this cannot be done because if you notice we acquire the mutex in
> i915_hwmon_power_max_disable() and release the mutex in
> i915_hwmon_power_max_restore().
> 
> I explained the reason why the mutex is handled this way in my reply
> to Jani Nikula here:
> 
> https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> 
> Quoting below:
> 
> ```
> > > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> >
> > Not too happy about that... any better ideas?
> 
> Afais, taking the mutex is the only fully correct solution (when we disable
> the power limit, userspace can go re-enable it). Examples of partly
> incorrect solutions (which don't take the mutex) include:
> 
> a. Don't take the mutex, don't do anything, ignore any changes to the value
>    if it has changed during GuC reset/fw load (just overwrite the changed
>    value). Con: changed value is lost.
> 
> b. Detect if the value has changed (the limit has been re-enabled) after we
>    have disabled the limit and in that case skip restoring the value. But
>    then someone can say why do we allow enabling the PL1 limit since we
>    want to disable it.
> 
> Both these are very unlikely scenarios so they might work. But I would
> first like to explore if holding a mutex across GuC reset is problematic
> since that is /the/ correct solution. But if anyone comes up with a reason
> why that cannot be done we can look at these other not completely correct
> options.

I see what you are doing and it looks indeed a very safe approach to ensure
the pl1 won't be toggled by other paths while we need some guaranteed state
here, or hw init fails badly.

But in the end you are using your lock to protect the code from another path
rather than protecting the data itself. The data was already protected in the
first version with the lock in the rmw.

Maybe we need some kind of state check, guarded by a separate state lock, so
that if we are in this forced state for the init path, a request from the normal
path is ignored and we move on, or maybe we queue the request...


> ```
> 
> > > +
> > > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > > +}
> > > +
> > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > > +	__releases(i915->hwmon->hwmon_lock)
> > We can just call this i915_hwmon_power_max_enable() and call whenever the
> > old value was actually enabled. That way, we have proper mirror functions.
> 
> As I explained that would mean adding two checks in the main __uc_init_hw()
> function which I am trying to avoid. So we have disable/restore pair.
> 
> > > +{
> > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > +	intel_wakeref_t wakeref;
> > > +
> > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > +		return;
> > > +
> > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > > +				 hwmon->rg.pkg_rapl_limit,
> > > +				 PKG_PWR_LIM_1_EN,
> > > +				 old ? PKG_PWR_LIM_1_EN : 0);
> >
> > 3rd param should be 0 here, else we will end up clearing other bits.
> 
> No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> the code here is correct. intel_uncore_rmw() does:
> 
>         val = (old & ~clear) | set;
> 
> So for now I am not making any changes, if you feel strongly about
> something one way or another let me know. Anyway these comments should help
> you understand the patch better so take a look and we can go from there.
> 
> Thanks.
> --
> Ashutosh
> 
> > > +
> > > +	mutex_unlock(&hwmon->hwmon_lock);
> > > +}
> > > +
> > >   static umode_t
> > >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> > >   {
> > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > > index 7ca9cf2c34c96..0fcb7de844061 100644
> > > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > > @@ -7,14 +7,21 @@
> > >   #ifndef __I915_HWMON_H__
> > >   #define __I915_HWMON_H__
> > >   +#include <linux/types.h>
> > > +
> > >   struct drm_i915_private;
> > > +struct intel_gt;
> > >     #if IS_REACHABLE(CONFIG_HWMON)
> > >   void i915_hwmon_register(struct drm_i915_private *i915);
> > >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> > >   #else
> > >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> > >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> > >   #endif
> > >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-26 11:52       ` Rodrigo Vivi
  0 siblings, 0 replies; 25+ messages in thread
From: Rodrigo Vivi @ 2023-03-26 11:52 UTC (permalink / raw)
  To: Dixit, Ashutosh; +Cc: intel-gfx, dri-devel

On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> >
> 
> Hi Vinay,
> 
> Thanks for the review. Comments inline below.
> 
> > On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > > On dGfx, the PL1 power limit being enabled and set to a low value results
> > > in a low GPU operating freq. It also negates the freq raise operation which
> > > is done before GuC firmware load. As a result GuC firmware load can time
> > > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > > limit was enabled and set to a low value). Therefore disable the PL1 power
> > > limit when allowed by HW when loading GuC firmware.
> > v3 label missing in subject.
> > >
> > > v2:
> > >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> > >   - Add hwm_power_max_restore to error return code path
> > >
> > > v3 (Jani N):
> > >   - Add/remove explanatory comments
> > >   - Function renames
> > >   - Type corrections
> > >   - Locking annotation
> > >
> > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > ---
> > >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> > >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> > >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> > >   3 files changed, 55 insertions(+)
> > >
> > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > @@ -18,6 +18,7 @@
> > >   #include "intel_uc.h"
> > >     #include "i915_drv.h"
> > > +#include "i915_hwmon.h"
> > >     static const struct intel_uc_ops uc_ops_off;
> > >   static const struct intel_uc_ops uc_ops_on;
> > > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> > >	struct intel_guc *guc = &uc->guc;
> > >	struct intel_huc *huc = &uc->huc;
> > >	int ret, attempts;
> > > +	bool pl1en;
> >
> > Init to 'false' here
> 
> See next comment.
> 
> >
> >
> > >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> > >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> > >	else
> > >		attempts = 1;
> > >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > > raised */
> > > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > > +
> > >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> > >		while (attempts--) {
> > > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > >	}
> > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > +
> > >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> > >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> > >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > >	/* Return GT back to RPn */
> > >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> >
> > if (pl1en)
> >
> >     i915_hwmon_power_max_enable().
> 
> IMO it's better not to have checks in the main __uc_init_hw() function (if
> we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> want we could do something like this inside
> i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> am not making any changes.
> 
> (I can send a patch with the changes if you want to take a look but IMO it
> will add more logic/code but without real benefits (it will save a rmw if
> the limit was already disabled, but IMO this code is called so infrequently
> (only during GuC resets) as to not have any significant impact)).
> 
> >
> > > +
> > >	__uc_sanitize(uc);
> > >		if (!ret) {
> > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > > index ee63a8fd88fc1..769b5bda4d53f 100644
> > > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> > >	}
> > >   }
> > >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > > *old)
> > Shouldn't we call this i915_hwmon_package_pl1_disable()?
> 
> I did think of using "pl1" in the function name but then decided to retain
> "power_max" because other hwmon functions for PL1 limit also use
> "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> show that all these functions deal with the PL1 power limit.
> 
> There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> power limit.
> 
> > > +	__acquires(i915->hwmon->hwmon_lock)
> > > +{
> > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > +	intel_wakeref_t wakeref;
> > > +	u32 r;
> > > +
> > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > +		return;
> > > +
> > > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > > +	mutex_lock(&hwmon->hwmon_lock);
> > > +
> > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > > +				     hwmon->rg.pkg_rapl_limit,
> > > +				     PKG_PWR_LIM_1_EN, 0);
> > Most of this code (lock and rmw parts) is already inside static void
> > hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> 
> This was the case in v1 of the patch:
> 
> https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> 
> But now this cannot be done because if you notice we acquire the mutex in
> i915_hwmon_power_max_disable() and release the mutex in
> i915_hwmon_power_max_restore().
> 
> I explained the reason why the mutex is handled this way in my reply
> to Jani Nikula here:
> 
> https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> 
> Quoting below:
> 
> ```
> > > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> >
> > Not too happy about that... any better ideas?
> 
> Afais, taking the mutex is the only fully correct solution (when we disable
> the power limit, userspace can go re-enable it). Examples of partly
> incorrect solutions (which don't take the mutex) include:
> 
> a. Don't take the mutex, don't do anything, ignore any changes to the value
>    if it has changed during GuC reset/fw load (just overwrite the changed
>    value). Con: changed value is lost.
> 
> b. Detect if the value has changed (the limit has been re-enabled) after we
>    have disabled the limit and in that case skip restoring the value. But
>    then someone can say why do we allow enabling the PL1 limit since we
>    want to disable it.
> 
> Both these are very unlikely scenarios so they might work. But I would
> first like to explore if holding a mutex across GuC reset is problematic
> since that is /the/ correct solution. But if anyone comes up with a reason
> why that cannot be done we can look at these other not completely correct
> options.

I see what you are doing and it looks indeed a very safe approach to ensure
the pl1 won't be toggled by other paths while we need some guaranteed state
here, or hw init fails badly.

But in the end you are using your lock to protect the code from another path,
not to protect the data itself. The data was already protected in the
first version by the lock in the rmw.

Maybe we need some kind of state check, protected by a separate state lock:
then, if we are in this forced state for the init path, a request from the normal
path is ignored and we move on, or maybe we queue the request...


> ```
> 
> > > +
> > > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > > +}
> > > +
> > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > > +	__releases(i915->hwmon->hwmon_lock)
> > We can just call this i915_hwmon_power_max_enable() and call whenever the
> > old value was actually enabled. That way, we have proper mirror functions.
> 
> As I explained that would mean adding two checks in the main __uc_init_hw()
> function which I am trying to avoid. So we have disable/restore pair.
> 
> > > +{
> > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > +	intel_wakeref_t wakeref;
> > > +
> > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > +		return;
> > > +
> > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > > +				 hwmon->rg.pkg_rapl_limit,
> > > +				 PKG_PWR_LIM_1_EN,
> > > +				 old ? PKG_PWR_LIM_1_EN : 0);
> >
> > 3rd param should be 0 here, else we will end up clearing other bits.
> 
> No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> the code here is correct. intel_uncore_rmw() does:
> 
>         val = (old & ~clear) | set;
> 
> So for now I am not making any changes, if you feel strongly about
> something one way or another let me know. Anyway these comments should help
> you understand the patch better so take a look and we can go from there.
> 
> Thanks.
> --
> Ashutosh
> 
> > > +
> > > +	mutex_unlock(&hwmon->hwmon_lock);
> > > +}
> > > +
> > >   static umode_t
> > >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> > >   {
> > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > > index 7ca9cf2c34c96..0fcb7de844061 100644
> > > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > > @@ -7,14 +7,21 @@
> > >   #ifndef __I915_HWMON_H__
> > >   #define __I915_HWMON_H__
> > >   +#include <linux/types.h>
> > > +
> > >   struct drm_i915_private;
> > > +struct intel_gt;
> > >     #if IS_REACHABLE(CONFIG_HWMON)
> > >   void i915_hwmon_register(struct drm_i915_private *i915);
> > >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> > >   #else
> > >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> > >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> > >   #endif
> > >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-25  0:06       ` [Intel-gfx] " Belgaumkar, Vinay
@ 2023-03-27 16:57         ` Dixit, Ashutosh
  -1 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-03-27 16:57 UTC (permalink / raw)
  To: Belgaumkar, Vinay
  Cc: intel-gfx, dri-devel, Badal Nilawar, Rodrigo Vivi, John Harrison

On Fri, 24 Mar 2023 17:06:33 -0700, Belgaumkar, Vinay wrote:
>

Hi Vinay,

> On 3/24/2023 4:31 PM, Dixit, Ashutosh wrote:
> > On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> > Hi Vinay,
> >
> > Thanks for the review. Comments inline below.
> Sorry about asking the same questions all over again :) Didn't look at
> previous versions.

Np, the previous versions were buried somewhere anyway that's why I
provided the link.

> >
> >> On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> >>> On dGfx, the PL1 power limit being enabled and set to a low value results
> >>> in a low GPU operating freq. It also negates the freq raise operation which
> >>> is done before GuC firmware load. As a result GuC firmware load can time
> >>> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> >>> limit was enabled and set to a low value). Therefore disable the PL1 power
> >>> limit when allowed by HW when loading GuC firmware.
> >> v3 label missing in subject.
> >>> v2:
> >>>    - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> >>>    - Add hwm_power_max_restore to error return code path
> >>>
> >>> v3 (Jani N):
> >>>    - Add/remove explanatory comments
> >>>    - Function renames
> >>>    - Type corrections
> >>>    - Locking annotation
> >>>
> >>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> >>> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> >>> ---
> >>>    drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> >>>    drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> >>>    drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> >>>    3 files changed, 55 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>> index 4ccb4be4c9cba..aa8e35a5636a0 100644
> >>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>> @@ -18,6 +18,7 @@
> >>>    #include "intel_uc.h"
> >>>      #include "i915_drv.h"
> >>> +#include "i915_hwmon.h"
> >>>      static const struct intel_uc_ops uc_ops_off;
> >>>    static const struct intel_uc_ops uc_ops_on;
> >>> @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>	struct intel_guc *guc = &uc->guc;
> >>>	struct intel_huc *huc = &uc->huc;
> >>>	int ret, attempts;
> >>> +	bool pl1en;
> >> Init to 'false' here
> > See next comment.
> >
> >>
> >>>		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> >>>	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> >>> @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>	else
> >>>		attempts = 1;
> >>>    +	/* Disable a potentially low PL1 power limit to allow freq to be
> >>> raised */
> >>> +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> >>> +
> >>>	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> >>>		while (attempts--) {
> >>> @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >>>	}
> >>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> >>> +
> >>>	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> >>>	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> >>>    @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>	/* Return GT back to RPn */
> >>>	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> >> if (pl1en)
> >>
> >>      i915_hwmon_power_max_enable().
> > IMO it's better not to have checks in the main __uc_init_hw() function (if
> > we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> > want we could do something like this inside
> > i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> > am not making any changes.
> ok.
> >
> > (I can send a patch with the changes if you want to take a look but IMO it
> > will add more logic/code but without real benefits (it will save a rmw if
> > the limit was already disabled, but IMO this code is called so infrequently
> > (only during GuC resets) as to not have any significant impact)).
> >
> >>> +
> >>>	__uc_sanitize(uc);
> >>>		if (!ret) {
> >>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> >>> index ee63a8fd88fc1..769b5bda4d53f 100644
> >>> --- a/drivers/gpu/drm/i915/i915_hwmon.c
> >>> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> >>> @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> >>>	}
> >>>    }
> >>>    +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> >>> *old)
> >> Shouldn't we call this i915_hwmon_package_pl1_disable()?
> > I did think of using "pl1" in the function name but then decided to retain
> > "power_max" because other hwmon functions for PL1 limit also use
> > "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> > "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> > show that all these functions deal with the PL1 power limit.
> >
> > There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> > power limit.
> ok.
> >
> >>> +	__acquires(i915->hwmon->hwmon_lock)
> >>> +{
> >>> +	struct i915_hwmon *hwmon = i915->hwmon;
> >>> +	intel_wakeref_t wakeref;
> >>> +	u32 r;
> >>> +
> >>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> >>> +		return;
> >>> +
> >>> +	/* Take mutex to prevent concurrent hwm_power_max_write */
> >>> +	mutex_lock(&hwmon->hwmon_lock);
> >>> +
> >>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> >>> +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> >>> +				     hwmon->rg.pkg_rapl_limit,
> >>> +				     PKG_PWR_LIM_1_EN, 0);
> >> Most of this code (lock and rmw parts) is already inside static void
> >> hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> > This was the case in v1 of the patch:
> >
> > https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> >
> > But now this cannot be done because if you notice we acquire the mutex in
> > i915_hwmon_power_max_disable() and release the mutex in
> > i915_hwmon_power_max_restore().
> >
> > I explained the reason why the mutex is handled this way in my reply
> > to Jani Nikula here:
> >
> > https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> >
> > Quoting below:
> >
> > ```
> >>> +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> >> Not too happy about that... any better ideas?
> > Afais, taking the mutex is the only fully correct solution (when we disable
> > the power limit, userspace can go re-enable it). Examples of partly
> > incorrect solutions (which don't take the mutex) include:
> >
> > a. Don't take the mutex, don't do anything, ignore any changes to the value
> >     if it has changed during GuC reset/fw load (just overwrite the changed
> >     value). Con: changed value is lost.
> >
> > b. Detect if the value has changed (the limit has been re-enabled) after we
> >     have disabled the limit and in that case skip restoring the value. But
> >     then someone can say why do we allow enabling the PL1 limit since we
> >     want to disable it.
> >
> > Both these are very unlikely scenarios so they might work. But I would
> > first like to explore if holding a mutex across GuC reset is problematic
> > since that is /the/ correct solution. But if anyone comes up with a reason
> > why that cannot be done we can look at these other not completely correct
> > options.
>
> Well, one reason is that this is adding a lot of duplicate/non-reusable
> code needlessly. If it gets re-used elsewhere, that could lead to some
> weird situations where the lock could be held for an extended period of
> time and introduce dependencies.

The lock will only be contended if userspace tries to set the PL1 limit while
GuC load is in progress. The chance of this is low (though not
zero). Otherwise the lock will be uncontended. But yes, the lock
is held across GuC load. I would think that as long as GuC load completes
within 200 ms (or even a few seconds) it should not be an issue. These
hwmon operations happen very infrequently, at intervals of
seconds, or at least 100s of milliseconds.

We could add a different lock here but I don't think it's worth it.

> Also, how/why would the user modify this
> PL1 during guc load? The sysfs interfaces are not even ready at this point?
> Even if we consider this during a resume, the terminal will not be
> available to the user.

A few points:

* Agree about probe, the sysfs doesn't exist
* About resume, again agreed. But in general a userspace process (say a
  daemon) might be writing to hwmon sysfs even when a terminal is not
  available. hwmon is accessed by L0 sysman e.g.
* Finally, GuC is loaded not only during probe and resume; GuC can also
  get reset because of error conditions, when the sysfs and the terminal
  are available (e.g. if you set a low PL1 limit and then run
  i915_hangman, it currently fails, as was seen in CI; I will try to write
  an IGT for this).

So that's the reason for taking the lock. I don't think taking the lock
is a big problem, which is why it is in the patch.

> > ```
> >
> >>> +
> >>> +	*old = !!(r & PKG_PWR_LIM_1_EN);
> >>> +}
> >>> +
> >>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> >>> +	__releases(i915->hwmon->hwmon_lock)
> >> We can just call this i915_hwmon_power_max_enable() and call whenever the
> >> old value was actually enabled. That way, we have proper mirror functions.
> > As I explained that would mean adding two checks in the main __uc_init_hw()
> > function which I am trying to avoid. So we have disable/restore pair.
> >
> >>> +{
> >>> +	struct i915_hwmon *hwmon = i915->hwmon;
> >>> +	intel_wakeref_t wakeref;
> >>> +
> >>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> >>> +		return;
> >>> +
> >>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> >>> +		intel_uncore_rmw(hwmon->ddat.uncore,
> >>> +				 hwmon->rg.pkg_rapl_limit,
> >>> +				 PKG_PWR_LIM_1_EN,
> >>> +				 old ? PKG_PWR_LIM_1_EN : 0);
> >> 3rd param should be 0 here, else we will end up clearing other bits.
> > No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> > the code here is correct. intel_uncore_rmw() does:
> >
> >          val = (old & ~clear) | set;
> Ok, just confusing, since you are also setting it with the 4th param.

No, the 3rd param is clearing (old & ~clear) and the 4th param is setting
(| set). Look at intel_uncore_rmw(). Also I've tested this patch ;-)

Thanks.
--
Ashutosh


> >
> > So for now I am not making any changes, if you feel strongly about
> > something one way or another let me know. Anyway these comments should help
> > you understand the patch better so take a look and we can go from there.
> >
> > Thanks.
> > --
> > Ashutosh
> >
> >>> +
> >>> +	mutex_unlock(&hwmon->hwmon_lock);
> >>> +}
> >>> +
> >>>    static umode_t
> >>>    hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> >>>    {
> >>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> >>> index 7ca9cf2c34c96..0fcb7de844061 100644
> >>> --- a/drivers/gpu/drm/i915/i915_hwmon.h
> >>> +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> >>> @@ -7,14 +7,21 @@
> >>>    #ifndef __I915_HWMON_H__
> >>>    #define __I915_HWMON_H__
> >>>    +#include <linux/types.h>
> >>> +
> >>>    struct drm_i915_private;
> >>> +struct intel_gt;
> >>>      #if IS_REACHABLE(CONFIG_HWMON)
> >>>    void i915_hwmon_register(struct drm_i915_private *i915);
> >>>    void i915_hwmon_unregister(struct drm_i915_private *i915);
> >>> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> >>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> >>>    #else
> >>>    static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> >>>    static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> >>> +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> >>> +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> >>>    #endif
> >>>      #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-27 16:57         ` Dixit, Ashutosh
  0 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-03-27 16:57 UTC (permalink / raw)
  To: Belgaumkar, Vinay; +Cc: intel-gfx, dri-devel, Rodrigo Vivi

On Fri, 24 Mar 2023 17:06:33 -0700, Belgaumkar, Vinay wrote:
>

Hi Vinay,

> On 3/24/2023 4:31 PM, Dixit, Ashutosh wrote:
> > On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> > Hi Vinay,
> >
> > Thanks for the review. Comments inline below.
> Sorry about asking the same questions all over again :) Didn't look at
> previous versions.

Np, the previous versions were buried somewhere anyway that's why I
provided the link.

> >
> >> On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> >>> On dGfx, the PL1 power limit being enabled and set to a low value results
> >>> in a low GPU operating freq. It also negates the freq raise operation which
> >>> is done before GuC firmware load. As a result GuC firmware load can time
> >>> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> >>> limit was enabled and set to a low value). Therefore disable the PL1 power
> >>> limit when allowed by HW when loading GuC firmware.
> >> v3 label missing in subject.
> >>> v2:
> >>>    - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> >>>    - Add hwm_power_max_restore to error return code path
> >>>
> >>> v3 (Jani N):
> >>>    - Add/remove explanatory comments
> >>>    - Function renames
> >>>    - Type corrections
> >>>    - Locking annotation
> >>>
> >>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> >>> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> >>> ---
> >>>    drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> >>>    drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> >>>    drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> >>>    3 files changed, 55 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>> index 4ccb4be4c9cba..aa8e35a5636a0 100644
> >>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>> @@ -18,6 +18,7 @@
> >>>    #include "intel_uc.h"
> >>>      #include "i915_drv.h"
> >>> +#include "i915_hwmon.h"
> >>>      static const struct intel_uc_ops uc_ops_off;
> >>>    static const struct intel_uc_ops uc_ops_on;
> >>> @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>	struct intel_guc *guc = &uc->guc;
> >>>	struct intel_huc *huc = &uc->huc;
> >>>	int ret, attempts;
> >>> +	bool pl1en;
> >> Init to 'false' here
> > See next comment.
> >
> >>
> >>>		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> >>>	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> >>> @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>	else
> >>>		attempts = 1;
> >>>    +	/* Disable a potentially low PL1 power limit to allow freq to be
> >>> raised */
> >>> +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> >>> +
> >>>	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> >>>		while (attempts--) {
> >>> @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >>>	}
> >>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> >>> +
> >>>	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> >>>	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> >>>    @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>	/* Return GT back to RPn */
> >>>	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> >> if (pl1en)
> >>
> >>      i915_hwmon_power_max_enable().
> > IMO it's better not to have checks in the main __uc_init_hw() function (if
> > we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> > want we could do something like this inside
> > i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> > am not making any changes.
> ok.
> >
> > (I can send a patch with the changes if you want to take a look but IMO it
> > will add more logic/code but without real benefits (it will save a rmw if
> > the limit was already disabled, but IMO this code is called so infrequently
> > (only during GuC resets) as to not have any significant impact)).
> >
> >>> +
> >>>	__uc_sanitize(uc);
> >>>		if (!ret) {
> >>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> >>> index ee63a8fd88fc1..769b5bda4d53f 100644
> >>> --- a/drivers/gpu/drm/i915/i915_hwmon.c
> >>> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> >>> @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> >>>	}
> >>>    }
> >>>    +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> >>> *old)
> >> Shouldn't we call this i915_hwmon_package_pl1_disable()?
> > I did think of using "pl1" in the function name but then decided to retain
> > "power_max" because other hwmon functions for PL1 limit also use
> > "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> > "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> > show that all these functions deal with the PL1 power limit.
> >
> > There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> > power limit.
> ok.
> >
> >>> +	__acquires(i915->hwmon->hwmon_lock)
> >>> +{
> >>> +	struct i915_hwmon *hwmon = i915->hwmon;
> >>> +	intel_wakeref_t wakeref;
> >>> +	u32 r;
> >>> +
> >>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> >>> +		return;
> >>> +
> >>> +	/* Take mutex to prevent concurrent hwm_power_max_write */
> >>> +	mutex_lock(&hwmon->hwmon_lock);
> >>> +
> >>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> >>> +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> >>> +				     hwmon->rg.pkg_rapl_limit,
> >>> +				     PKG_PWR_LIM_1_EN, 0);
> >> Most of this code (lock and rmw parts) is already inside static void
> >> hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> > This was the case in v1 of the patch:
> >
> > https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> >
> > But now this cannot be done because if you notice we acquire the mutex in
> > i915_hwmon_power_max_disable() and release the mutex in
> > i915_hwmon_power_max_restore().
> >
> > I explained the reason why the mutex is handled this way in my reply
> > to Jani Nikula here:
> >
> > https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> >
> > Quoting below:
> >
> > ```
> >>> +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> >> Not too happy about that... any better ideas?
> > Afais, taking the mutex is the only fully correct solution (when we disable
> > the power limit, userspace can go re-enable it). Examples of partly
> > incorrect solutions (which don't take the mutex) include:
> >
> > a. Don't take the mutex, don't do anything, ignore any changes to the value
> >     if it has changed during GuC reset/fw load (just overwrite the changed
> >     value). Con: changed value is lost.
> >
> > b. Detect if the value has changed (the limit has been re-enabled) after we
> >     have disabled the limit and in that case skip restoring the value. But
> >     then someone can say why do we allow enabling the PL1 limit since we
> >     want to disable it.
> >
> > Both these are very unlikely scenarios so they might work. But I would
> > first like to explore if holding a mutex across GuC reset is problematic
> > since that is /the/ correct solution. But if anyone comes up with a reason
> > why that cannot be done we can look at these other not completely correct
> > options.
>
> Well, one reason is that this is adding a lot of duplicate/non-reusable
> code needlessly. If it gets re-used elsewhere, that could lead to some
> weird situations where the lock could be held for an extended period of
> time and introduce dependencies.

The lock will only be contended if userspace tries to set the PL1 limit while
GuC load is in progress. The chance of this is low (though not
zero). Otherwise the lock will be uncontended. But yes, the lock
is held across GuC load. I would think that as long as GuC load completes
within 200 ms (or even a few seconds) it should not be an issue. These
hwmon operations happen very infrequently, at intervals of
seconds, or at least 100s of milliseconds.

We could add a different lock here but I don't think it's worth it.

> Also, how/why would the user modify this
> PL1 during guc load? The sysfs interfaces are not even ready at this point?
> Even if we consider this during a resume, the terminal will not be
> available to the user.

A few points:

* Agree about probe, the sysfs doesn't exist
* About resume, again agreed. But in general a userspace process (say a
  daemon) might be writing to hwmon sysfs even when a terminal is not
  available. hwmon is accessed by L0 sysman e.g.
* Finally, GuC is loaded not only during probe and resume; GuC can also
  get reset because of error conditions, when the sysfs and the terminal
  are available (e.g. what if you set a low PL1 limit and then run
  i915_hangman? It currently fails, as was seen in CI. I will try to write
  an IGT for this).

So that's the reason for taking the lock. I think it's not a big problem
even if we take the lock, which is why it is in the patch.

> > ```
> >
> >>> +
> >>> +	*old = !!(r & PKG_PWR_LIM_1_EN);
> >>> +}
> >>> +
> >>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> >>> +	__releases(i915->hwmon->hwmon_lock)
> >> We can just call this i915_hwmon_power_max_enable() and call whenever the
> >> old value was actually enabled. That way, we have proper mirror functions.
> > As I explained that would mean adding two checks in the main __uc_init_hw()
> > function which I am trying to avoid. So we have disable/restore pair.
> >
> >>> +{
> >>> +	struct i915_hwmon *hwmon = i915->hwmon;
> >>> +	intel_wakeref_t wakeref;
> >>> +
> >>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> >>> +		return;
> >>> +
> >>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> >>> +		intel_uncore_rmw(hwmon->ddat.uncore,
> >>> +				 hwmon->rg.pkg_rapl_limit,
> >>> +				 PKG_PWR_LIM_1_EN,
> >>> +				 old ? PKG_PWR_LIM_1_EN : 0);
> >> 3rd param should be 0 here, else we will end up clearing other bits.
> > No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> > the code here is correct. intel_uncore_rmw() does:
> >
> >          val = (old & ~clear) | set;
> Ok, just confusing, since you are also setting it with the 4th param.

No, the 3rd param is clearing (old & ~clear) and the 4th param is setting
(| set). Look at intel_uncore_rmw(). Also I've tested this patch ;-)

Thanks.
--
Ashutosh


> >
> > So for now I am not making any changes, if you feel strongly about
> > something one way or another let me know. Anyway these comments should help
> > you understand the patch better so take a look and we can go from there.
> >
> > Thanks.
> > --
> > Ashutosh
> >
> >>> +
> >>> +	mutex_unlock(&hwmon->hwmon_lock);
> >>> +}
> >>> +
> >>>    static umode_t
> >>>    hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> >>>    {
> >>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> >>> index 7ca9cf2c34c96..0fcb7de844061 100644
> >>> --- a/drivers/gpu/drm/i915/i915_hwmon.h
> >>> +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> >>> @@ -7,14 +7,21 @@
> >>>    #ifndef __I915_HWMON_H__
> >>>    #define __I915_HWMON_H__
> >>>    +#include <linux/types.h>
> >>> +
> >>>    struct drm_i915_private;
> >>> +struct intel_gt;
> >>>      #if IS_REACHABLE(CONFIG_HWMON)
> >>>    void i915_hwmon_register(struct drm_i915_private *i915);
> >>>    void i915_hwmon_unregister(struct drm_i915_private *i915);
> >>> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> >>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> >>>    #else
> >>>    static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> >>>    static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> >>> +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> >>> +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> >>>    #endif
> >>>      #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-26 11:52       ` [Intel-gfx] " Rodrigo Vivi
@ 2023-03-27 16:58         ` Dixit, Ashutosh
  -1 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-03-27 16:58 UTC (permalink / raw)
  To: Rodrigo Vivi
  Cc: Belgaumkar, Vinay, intel-gfx, Badal Nilawar, dri-devel, John Harrison

On Sun, 26 Mar 2023 04:52:59 -0700, Rodrigo Vivi wrote:
>

Hi Rodrigo,

> On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> > On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> > >
> >
> > Hi Vinay,
> >
> > Thanks for the review. Comments inline below.
> >
> > > On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > > > On dGfx, the PL1 power limit being enabled and set to a low value results
> > > > in a low GPU operating freq. It also negates the freq raise operation which
> > > > is done before GuC firmware load. As a result GuC firmware load can time
> > > > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > > > limit was enabled and set to a low value). Therefore disable the PL1 power
> > > > limit when allowed by HW when loading GuC firmware.
> > > v3 label missing in subject.
> > > >
> > > > v2:
> > > >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> > > >   - Add hwm_power_max_restore to error return code path
> > > >
> > > > v3 (Jani N):
> > > >   - Add/remove explanatory comments
> > > >   - Function renames
> > > >   - Type corrections
> > > >   - Locking annotation
> > > >
> > > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > > ---
> > > >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> > > >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> > > >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> > > >   3 files changed, 55 insertions(+)
> > > >
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > @@ -18,6 +18,7 @@
> > > >   #include "intel_uc.h"
> > > >     #include "i915_drv.h"
> > > > +#include "i915_hwmon.h"
> > > >     static const struct intel_uc_ops uc_ops_off;
> > > >   static const struct intel_uc_ops uc_ops_on;
> > > > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > >	struct intel_guc *guc = &uc->guc;
> > > >	struct intel_huc *huc = &uc->huc;
> > > >	int ret, attempts;
> > > > +	bool pl1en;
> > >
> > > Init to 'false' here
> >
> > See next comment.
> >
> > >
> > >
> > > >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> > > >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > > > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > >	else
> > > >		attempts = 1;
> > > >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > > > raised */
> > > > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > > > +
> > > >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> > > >		while (attempts--) {
> > > > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > >	}
> > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > > +
> > > >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> > > >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> > > >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > >	/* Return GT back to RPn */
> > > >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > >
> > > if (pl1en)
> > >
> > >     i915_hwmon_power_max_enable().
> >
> > IMO it's better not to have checks in the main __uc_init_hw() function (if
> > we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> > want we could do something like this inside
> > i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> > am not making any changes.
> >
> > (I can send a patch with the changes if you want to take a look but IMO it
> > will add more logic/code but without real benefits (it will save a rmw if
> > the limit was already disabled, but IMO this code is called so infrequently
> > (only during GuC resets) as to not have any significant impact)).
> >
> > >
> > > > +
> > > >	__uc_sanitize(uc);
> > > >		if (!ret) {
> > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > index ee63a8fd88fc1..769b5bda4d53f 100644
> > > > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> > > >	}
> > > >   }
> > > >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > > > *old)
> > > Shouldn't we call this i915_hwmon_package_pl1_disable()?
> >
> > I did think of using "pl1" in the function name but then decided to retain
> > "power_max" because other hwmon functions for PL1 limit also use
> > "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> > "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> > show that all these functions deal with the PL1 power limit.
> >
> > There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> > power limit.
> >
> > > > +	__acquires(i915->hwmon->hwmon_lock)
> > > > +{
> > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > +	intel_wakeref_t wakeref;
> > > > +	u32 r;
> > > > +
> > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > +		return;
> > > > +
> > > > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > > > +	mutex_lock(&hwmon->hwmon_lock);
> > > > +
> > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > > > +				     hwmon->rg.pkg_rapl_limit,
> > > > +				     PKG_PWR_LIM_1_EN, 0);
> > > Most of this code (lock and rmw parts) is already inside static void
> > > hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> >
> > This was the case in v1 of the patch:
> >
> > https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> >
> > But now this cannot be done because if you notice we acquire the mutex in
> > i915_hwmon_power_max_disable() and release the mutex in
> > i915_hwmon_power_max_restore().
> >
> > > I explained the reason why the mutex is handled this way in my reply
> > to Jani Nikula here:
> >
> > https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> >
> > Quoting below:
> >
> > ```
> > > > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> > >
> > > Not too happy about that... any better ideas?
> >
> > Afais, taking the mutex is the only fully correct solution (when we disable
> > the power limit, userspace can go re-enable it). Examples of partly
> > incorrect solutions (which don't take the mutex) include:
> >
> > a. Don't take the mutex, don't do anything, ignore any changes to the value
> >    if it has changed during GuC reset/fw load (just overwrite the changed
> >    value). Con: changed value is lost.
> >
> > b. Detect if the value has changed (the limit has been re-enabled) after we
> >    have disabled the limit and in that case skip restoring the value. But
> >    then someone can say why do we allow enabling the PL1 limit since we
> >    want to disable it.
> >
> > Both these are very unlikely scenarios so they might work. But I would
> > > first like to explore if holding a mutex across GuC reset is problematic
> > since that is /the/ correct solution. But if anyone comes up with a reason
> > why that cannot be done we can look at these other not completely correct
> > options.
>
> I see what you are doing and it looks indeed a very safe approach to ensure
> the pl1 won't be toggled by other paths while we need some guaranteed state
> here, or hw init fails badly.
>
> But in the end you are making your lock to protect the code from another path
> and not protecting the data itself. The data was already protected in the
> first version with the lock in the rmw.

Sorry I am not really following. Daniel had mentioned this "protecting code
vs protecting data" but I am wondering how it is applicable in this
case. IMO here the data we are protecting is the register which we don't
want written to by userland while GuC load is in progress. To do that we
need to block the code path writing to register. So what we have here seems
to me to be the simplest and cleanest approach for solving this issue.

> maybe we need to have some kind of a state check with other state-lock and
> then if we are in this forced state for init path, the request for the normal path
> ignores it and moves on,

I don't see how this will *not* be racy...

> or maybe we queue some request...

Queuing a request will not be enough (even if this is possible), the
request will need to wait to complete till GuC load completes. So we'll
have to complete the request when GuC load completes, similar to releasing
the mutex in the current patch. Looks like a much more complicated way of
doing what the mutex does very simply.

So:

a. What is the real problem with the current implementation?

b. What would be the correct solution for it? That is how, specifically,
   should we implement it?

Some more guidance will be helpful if you think this patch has issues.

Thanks.
--
Ashutosh

> > ```
> >
> > > > +
> > > > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > > > +}
> > > > +
> > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > > > +	__releases(i915->hwmon->hwmon_lock)
> > > We can just call this i915_hwmon_power_max_enable() and call whenever the
> > > old value was actually enabled. That way, we have proper mirror functions.
> >
> > As I explained that would mean adding two checks in the main __uc_init_hw()
> > function which I am trying to avoid. So we have disable/restore pair.
> >
> > > > +{
> > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > +	intel_wakeref_t wakeref;
> > > > +
> > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > +		return;
> > > > +
> > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > > > +				 hwmon->rg.pkg_rapl_limit,
> > > > +				 PKG_PWR_LIM_1_EN,
> > > > +				 old ? PKG_PWR_LIM_1_EN : 0);
> > >
> > > 3rd param should be 0 here, else we will end up clearing other bits.
> >
> > No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> > the code here is correct. intel_uncore_rmw() does:
> >
> >         val = (old & ~clear) | set;
> >
> > So for now I am not making any changes, if you feel strongly about
> > something one way or another let me know. Anyway these comments should help
> > you understand the patch better so take a look and we can go from there.
> >
> > Thanks.
> > --
> > Ashutosh
> >
> > > > +
> > > > +	mutex_unlock(&hwmon->hwmon_lock);
> > > > +}
> > > > +
> > > >   static umode_t
> > > >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> > > >   {
> > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > index 7ca9cf2c34c96..0fcb7de844061 100644
> > > > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > @@ -7,14 +7,21 @@
> > > >   #ifndef __I915_HWMON_H__
> > > >   #define __I915_HWMON_H__
> > > >   +#include <linux/types.h>
> > > > +
> > > >   struct drm_i915_private;
> > > > +struct intel_gt;
> > > >     #if IS_REACHABLE(CONFIG_HWMON)
> > > >   void i915_hwmon_register(struct drm_i915_private *i915);
> > > >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > > > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> > > >   #else
> > > >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> > > >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > > > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > > > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> > > >   #endif
> > > >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-27 16:58         ` Dixit, Ashutosh
  0 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-03-27 16:58 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: intel-gfx, dri-devel

On Sun, 26 Mar 2023 04:52:59 -0700, Rodrigo Vivi wrote:
>

Hi Rodrigo,

> On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> > On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> > >
> >
> > Hi Vinay,
> >
> > Thanks for the review. Comments inline below.
> >
> > > On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > > > On dGfx, the PL1 power limit being enabled and set to a low value results
> > > > in a low GPU operating freq. It also negates the freq raise operation which
> > > > is done before GuC firmware load. As a result GuC firmware load can time
> > > > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > > > limit was enabled and set to a low value). Therefore disable the PL1 power
> > > > limit when allowed by HW when loading GuC firmware.
> > > v3 label missing in subject.
> > > >
> > > > v2:
> > > >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> > > >   - Add hwm_power_max_restore to error return code path
> > > >
> > > > v3 (Jani N):
> > > >   - Add/remove explanatory comments
> > > >   - Function renames
> > > >   - Type corrections
> > > >   - Locking annotation
> > > >
> > > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > > ---
> > > >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> > > >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> > > >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> > > >   3 files changed, 55 insertions(+)
> > > >
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > @@ -18,6 +18,7 @@
> > > >   #include "intel_uc.h"
> > > >     #include "i915_drv.h"
> > > > +#include "i915_hwmon.h"
> > > >     static const struct intel_uc_ops uc_ops_off;
> > > >   static const struct intel_uc_ops uc_ops_on;
> > > > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > >	struct intel_guc *guc = &uc->guc;
> > > >	struct intel_huc *huc = &uc->huc;
> > > >	int ret, attempts;
> > > > +	bool pl1en;
> > >
> > > Init to 'false' here
> >
> > See next comment.
> >
> > >
> > >
> > > >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> > > >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > > > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > >	else
> > > >		attempts = 1;
> > > >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > > > raised */
> > > > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > > > +
> > > >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> > > >		while (attempts--) {
> > > > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > >	}
> > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > > +
> > > >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> > > >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> > > >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > >	/* Return GT back to RPn */
> > > >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > >
> > > if (pl1en)
> > >
> > >     i915_hwmon_power_max_enable().
> >
> > IMO it's better not to have checks in the main __uc_init_hw() function (if
> > we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> > want we could do something like this inside
> > i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> > am not making any changes.
> >
> > (I can send a patch with the changes if you want to take a look but IMO it
> > will add more logic/code but without real benefits (it will save a rmw if
> > the limit was already disabled, but IMO this code is called so infrequently
> > (only during GuC resets) as to not have any significant impact)).
> >
> > >
> > > > +
> > > >	__uc_sanitize(uc);
> > > >		if (!ret) {
> > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > index ee63a8fd88fc1..769b5bda4d53f 100644
> > > > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> > > >	}
> > > >   }
> > > >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > > > *old)
> > > Shouldn't we call this i915_hwmon_package_pl1_disable()?
> >
> > I did think of using "pl1" in the function name but then decided to retain
> > "power_max" because other hwmon functions for PL1 limit also use
> > "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> > "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> > show that all these functions deal with the PL1 power limit.
> >
> > There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> > power limit.
> >
> > > > +	__acquires(i915->hwmon->hwmon_lock)
> > > > +{
> > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > +	intel_wakeref_t wakeref;
> > > > +	u32 r;
> > > > +
> > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > +		return;
> > > > +
> > > > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > > > +	mutex_lock(&hwmon->hwmon_lock);
> > > > +
> > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > > > +				     hwmon->rg.pkg_rapl_limit,
> > > > +				     PKG_PWR_LIM_1_EN, 0);
> > > Most of this code (lock and rmw parts) is already inside static void
> > > hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> >
> > This was the case in v1 of the patch:
> >
> > https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> >
> > But now this cannot be done because if you notice we acquire the mutex in
> > i915_hwmon_power_max_disable() and release the mutex in
> > i915_hwmon_power_max_restore().
> >
> > > I explained the reason why the mutex is handled this way in my reply
> > to Jani Nikula here:
> >
> > https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> >
> > Quoting below:
> >
> > ```
> > > > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> > >
> > > Not too happy about that... any better ideas?
> >
> > Afais, taking the mutex is the only fully correct solution (when we disable
> > the power limit, userspace can go re-enable it). Examples of partly
> > incorrect solutions (which don't take the mutex) include:
> >
> > a. Don't take the mutex, don't do anything, ignore any changes to the value
> >    if it has changed during GuC reset/fw load (just overwrite the changed
> >    value). Con: changed value is lost.
> >
> > b. Detect if the value has changed (the limit has been re-enabled) after we
> >    have disabled the limit and in that case skip restoring the value. But
> >    then someone can say why do we allow enabling the PL1 limit since we
> >    want to disable it.
> >
> > Both these are very unlikely scenarios so they might work. But I would
> > > first like to explore if holding a mutex across GuC reset is problematic
> > since that is /the/ correct solution. But if anyone comes up with a reason
> > why that cannot be done we can look at these other not completely correct
> > options.
>
> I see what you are doing and it looks indeed a very safe approach to ensure
> the pl1 won't be toggled by other paths while we need some guaranteed state
> here, or hw init fails badly.
>
> But in the end you are making your lock to protect the code from another path
> and not protecting the data itself. The data was already protected in the
> first version with the lock in the rmw.

Sorry I am not really following. Daniel had mentioned this "protecting code
vs protecting data" but I am wondering how it is applicable in this
case. IMO here the data we are protecting is the register which we don't
want written to by userland while GuC load is in progress. To do that we
need to block the code path writing to register. So what we have here seems
to me to be the simplest and cleanest approach for solving this issue.

> maybe we need to have some kind of a state check with other state-lock and
> then if we are in this forced state for init path, the request for the normal path
> ignores it and moves on,

I don't see how this will *not* be racy...

> or maybe we queue some request...

Queuing a request will not be enough (even if this is possible), the
request will need to wait to complete till GuC load completes. So we'll
have to complete the request when GuC load completes, similar to releasing
the mutex in the current patch. Looks like a much more complicated way of
doing what the mutex does very simply.

So:

a. What is the real problem with the current implementation?

b. What would be the correct solution for it? That is how, specifically,
   should we implement it?

Some more guidance will be helpful if you think this patch has issues.

Thanks.
--
Ashutosh

> > ```
> >
> > > > +
> > > > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > > > +}
> > > > +
> > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > > > +	__releases(i915->hwmon->hwmon_lock)
> > > We can just call this i915_hwmon_power_max_enable() and call whenever the
> > > old value was actually enabled. That way, we have proper mirror functions.
> >
> > As I explained that would mean adding two checks in the main __uc_init_hw()
> > function which I am trying to avoid. So we have disable/restore pair.
> >
> > > > +{
> > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > +	intel_wakeref_t wakeref;
> > > > +
> > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > +		return;
> > > > +
> > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > > > +				 hwmon->rg.pkg_rapl_limit,
> > > > +				 PKG_PWR_LIM_1_EN,
> > > > +				 old ? PKG_PWR_LIM_1_EN : 0);
> > >
> > > 3rd param should be 0 here, else we will end up clearing other bits.
> >
> > No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> > the code here is correct. intel_uncore_rmw() does:
> >
> >         val = (old & ~clear) | set;
> >
> > So for now I am not making any changes, if you feel strongly about
> > something one way or another let me know. Anyway these comments should help
> > you understand the patch better so take a look and we can go from there.
> >
> > Thanks.
> > --
> > Ashutosh
> >
> > > > +
> > > > +	mutex_unlock(&hwmon->hwmon_lock);
> > > > +}
> > > > +
> > > >   static umode_t
> > > >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> > > >   {
> > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > index 7ca9cf2c34c96..0fcb7de844061 100644
> > > > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > @@ -7,14 +7,21 @@
> > > >   #ifndef __I915_HWMON_H__
> > > >   #define __I915_HWMON_H__
> > > >   +#include <linux/types.h>
> > > > +
> > > >   struct drm_i915_private;
> > > > +struct intel_gt;
> > > >     #if IS_REACHABLE(CONFIG_HWMON)
> > > >   void i915_hwmon_register(struct drm_i915_private *i915);
> > > >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > > > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> > > >   #else
> > > >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> > > >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > > > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > > > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> > > >   #endif
> > > >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-27 16:58         ` [Intel-gfx] " Dixit, Ashutosh
@ 2023-03-27 17:47           ` Rodrigo Vivi
  -1 siblings, 0 replies; 25+ messages in thread
From: Rodrigo Vivi @ 2023-03-27 17:47 UTC (permalink / raw)
  To: Dixit, Ashutosh, Daniel Vetter
  Cc: Jani Nikula, Tvrtko Ursulin, intel-gfx, dri-devel


+Daniel

On Mon, Mar 27, 2023 at 09:58:52AM -0700, Dixit, Ashutosh wrote:
> On Sun, 26 Mar 2023 04:52:59 -0700, Rodrigo Vivi wrote:
> >
> 
> Hi Rodrigo,
> 
> > On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> > > On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> > > >
> > >
> > > Hi Vinay,
> > >
> > > Thanks for the review. Comments inline below.
> > >
> > > > On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > > > > On dGfx, the PL1 power limit being enabled and set to a low value results
> > > > > in a low GPU operating freq. It also negates the freq raise operation which
> > > > > is done before GuC firmware load. As a result GuC firmware load can time
> > > > > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > > > > limit was enabled and set to a low value). Therefore disable the PL1 power
> > > > > limit when allowed by HW when loading GuC firmware.
> > > > v3 label missing in subject.
> > > > >
> > > > > v2:
> > > > >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> > > > >   - Add hwm_power_max_restore to error return code path
> > > > >
> > > > > v3 (Jani N):
> > > > >   - Add/remove explanatory comments
> > > > >   - Function renames
> > > > >   - Type corrections
> > > > >   - Locking annotation
> > > > >
> > > > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > > > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > > > ---
> > > > >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> > > > >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> > > > >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> > > > >   3 files changed, 55 insertions(+)
> > > > >
> > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > @@ -18,6 +18,7 @@
> > > > >   #include "intel_uc.h"
> > > > >     #include "i915_drv.h"
> > > > > +#include "i915_hwmon.h"
> > > > >     static const struct intel_uc_ops uc_ops_off;
> > > > >   static const struct intel_uc_ops uc_ops_on;
> > > > > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > >	struct intel_guc *guc = &uc->guc;
> > > > >	struct intel_huc *huc = &uc->huc;
> > > > >	int ret, attempts;
> > > > > +	bool pl1en;
> > > >
> > > > Init to 'false' here
> > >
> > > See next comment.
> > >
> > > >
> > > >
> > > > >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> > > > >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > > > > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > >	else
> > > > >		attempts = 1;
> > > > >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > > > > raised */
> > > > > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > > > > +
> > > > >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> > > > >		while (attempts--) {
> > > > > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > > >	}
> > > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > > > +
> > > > >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> > > > >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> > > > >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > >	/* Return GT back to RPn */
> > > > >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > >
> > > > if (pl1en)
> > > >
> > > >     i915_hwmon_power_max_enable().
> > >
> > > IMO it's better not to have checks in the main __uc_init_hw() function (if
> > > we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> > > want we could do something like this inside
> > > i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> > > am not making any changes.
> > >
> > > (I can send a patch with the changes if you want to take a look but IMO it
> > > will add more logic/code but without real benefits (it will save a rmw if
> > > the limit was already disabled, but IMO this code is called so infrequently
> > > (only during GuC resets) as to not have any significant impact)).
> > >
> > > >
> > > > > +
> > > > >	__uc_sanitize(uc);
> > > > >		if (!ret) {
> > > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > index ee63a8fd88fc1..769b5bda4d53f 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> > > > >	}
> > > > >   }
> > > > >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > > > > *old)
> > > > Shouldn't we call this i915_hwmon_package_pl1_disable()?
> > >
> > > I did think of using "pl1" in the function name but then decided to retain
> > > "power_max" because other hwmon functions for PL1 limit also use
> > > "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> > > "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> > > show that all these functions deal with the PL1 power limit.
> > >
> > > There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> > > power limit.
> > >
> > > > > +	__acquires(i915->hwmon->hwmon_lock)
> > > > > +{
> > > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > > +	intel_wakeref_t wakeref;
> > > > > +	u32 r;
> > > > > +
> > > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > > +		return;
> > > > > +
> > > > > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > > > > +	mutex_lock(&hwmon->hwmon_lock);
> > > > > +
> > > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > > > > +				     hwmon->rg.pkg_rapl_limit,
> > > > > +				     PKG_PWR_LIM_1_EN, 0);
> > > > Most of this code (lock and rmw parts) is already inside static void
> > > > hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> > >
> > > This was the case in v1 of the patch:
> > >
> > > https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> > >
> > > But now this cannot be done because if you notice we acquire the mutex in
> > > i915_hwmon_power_max_disable() and release the mutex in
> > > i915_hwmon_power_max_restore().
> > >
> > > > I explained the reason why the mutex is handled this way in my reply
> > > to Jani Nikula here:
> > >
> > > https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> > >
> > > Quoting below:
> > >
> > > ```
> > > > > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> > > >
> > > > Not too happy about that... any better ideas?
> > >
> > > Afais, taking the mutex is the only fully correct solution (when we disable
> > > the power limit, userspace can go re-enable it). Examples of partly
> > > incorrect solutions (which don't take the mutex) include:
> > >
> > > a. Don't take the mutex, don't do anything, ignore any changes to the value
> > >    if it has changed during GuC reset/fw load (just overwrite the changed
> > >    value). Con: changed value is lost.
> > >
> > > b. Detect if the value has changed (the limit has been re-enabled) after we
> > >    have disabled the limit and in that case skip restoring the value. But
> > >    then someone can say why do we allow enabling the PL1 limit since we
> > >    want to disable it.
> > >
> > > Both these are very unlikely scenarios so they might work. But I would
> > > > first like to explore if holding a mutex across GuC reset is problematic
> > > since that is /the/ correct solution. But if anyone comes up with a reason
> > > why that cannot be done we can look at these other not completely correct
> > > options.
> >
> > I see what you are doing and it looks indeed a very safe approach to ensure
> > the pl1 won't be toggled by other paths while we need some guaranteed state
> > here, or hw init fails badly.
> >
> > But in the end you are making your lock to protect the code from another path
> > and not protecting the data itself. The data was already protected in the
> > first version with the lock in the rmw.
> 
> Sorry I am not really following. Daniel had mentioned this "protecting code
> vs protecting data" but I am wondering how it is applicable in this
> case. IMO here the data we are protecting is the register which we don't
> want written to by userland while GuC load is in progress. To do that we
> need to block the code path writing to register. So what we have here seems
> to me to be the simplest and cleanest approach for solving this issue.

I believe your case here is exactly what Daniel had mentioned as protecting
code and not data. Well, in the end we are of course protecting data to be
modified, but in your case you use that mutex to also protect the code path
and avoid other calls while you are in this guc_init_path...

Please Daniel, correct me here if I got it wrong.

What I don't like here is that we take the lock in one function, hold it for a
while, and release it from another function. To protect the data itself, in general
we only need to hold the lock for the very short time while we are modifying the data.

> 
> > maybe we need to have some kind of a state check with other state-lock and
> > then if we are in this forced state for init path, the request from the normal path
> > is ignored and we move on,
> 
> I don't see how this will *not* be racy...

maybe something like this?:

at power_max_disable:
mutex_lock(data_lock);

mutex_lock(state_lock);
state = in_use;
mutex_unlock(state_lock);

mmio_rmw();
mutex_unlock(data_lock);


at power_max_restore:
mutex_lock(data_lock);

mutex_lock(state_lock);
state = available;
mutex_unlock(state_lock);

mmio_rmw();
mutex_unlock(data_lock);

at sysfs fn:

mutex_lock(data_lock);
mutex_lock(state_lock);
if (state == in_use) {
   ret = -EAGAIN
   goto out;
}
mutex_unlock(state_lock);

....

out:

mutex_unlock(data_lock);

> 
> > or maybe we queue some request...
> 
> Queuing a request will not be enough (even if this is possible), the
> request will need to wait to complete till GuC load completes. So we'll
> have to complete the request when GuC load completes, similar to releasing
> the mutex in the current patch. Looks like a much more complicated way of
> doing what the mutex does very simply.

The wq would sleep/delay while state == in_use, then process the next request...

> 
> So:
> 
> a. What is the real problem with the current implementation?

probably the big lock used to protect the state machinery...

but if other folks believe that we don't have an actual problem here
and this big lock is acceptable as long as it has the annotation for
the static analyzers, I'm okay to just let it go...


> 
> b. What would be the correct solution for it? That is how, specifically,
>    should we implement it?

state handling with separated lock from the data itself is my suggestion.

> 
> Some more guidance will be helpful if you think this patch has issues.

I hope Daniel and/or other i915 maintainers can jump in here. Especially if
I'm being too paranoid and the current patch is enough...

> 
> Thanks.
> --
> Ashutosh
> 
> > > ```
> > >
> > > > > +
> > > > > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > > > > +}
> > > > > +
> > > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > > > > +	__releases(i915->hwmon->hwmon_lock)
> > > > We can just call this i915_hwmon_power_max_enable() and call whenever the
> > > > old value was actually enabled. That way, we have proper mirror functions.
> > >
> > > As I explained that would mean adding two checks in the main __uc_init_hw()
> > > function which I am trying to avoid. So we have disable/restore pair.
> > >
> > > > > +{
> > > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > > +	intel_wakeref_t wakeref;
> > > > > +
> > > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > > +		return;
> > > > > +
> > > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > > > > +				 hwmon->rg.pkg_rapl_limit,
> > > > > +				 PKG_PWR_LIM_1_EN,
> > > > > +				 old ? PKG_PWR_LIM_1_EN : 0);
> > > >
> > > > 3rd param should be 0 here, else we will end up clearing other bits.
> > >
> > > No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> > > the code here is correct. intel_uncore_rmw() does:
> > >
> > >         val = (old & ~clear) | set;
> > >
> > > So for now I am not making any changes, if you feel strongly about
> > > something one way or another let me know. Anyway these comments should help
> > > you understand the patch better so take a look and we can go from there.
> > >
> > > Thanks.
> > > --
> > > Ashutosh
> > >
> > > > > +
> > > > > +	mutex_unlock(&hwmon->hwmon_lock);
> > > > > +}
> > > > > +
> > > > >   static umode_t
> > > > >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> > > > >   {
> > > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > index 7ca9cf2c34c96..0fcb7de844061 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > @@ -7,14 +7,21 @@
> > > > >   #ifndef __I915_HWMON_H__
> > > > >   #define __I915_HWMON_H__
> > > > >   +#include <linux/types.h>
> > > > > +
> > > > >   struct drm_i915_private;
> > > > > +struct intel_gt;
> > > > >     #if IS_REACHABLE(CONFIG_HWMON)
> > > > >   void i915_hwmon_register(struct drm_i915_private *i915);
> > > > >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > > > > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> > > > >   #else
> > > > >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> > > > >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > > > > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > > > > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> > > > >   #endif
> > > > >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-27 17:47           ` Rodrigo Vivi
  0 siblings, 0 replies; 25+ messages in thread
From: Rodrigo Vivi @ 2023-03-27 17:47 UTC (permalink / raw)
  To: Dixit, Ashutosh, Daniel Vetter; +Cc: Jani Nikula, intel-gfx, dri-devel


+Daniel

On Mon, Mar 27, 2023 at 09:58:52AM -0700, Dixit, Ashutosh wrote:
> On Sun, 26 Mar 2023 04:52:59 -0700, Rodrigo Vivi wrote:
> >
> 
> Hi Rodrigo,
> 
> > On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> > > On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> > > >
> > >
> > > Hi Vinay,
> > >
> > > Thanks for the review. Comments inline below.
> > >
> > > > On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > > > > On dGfx, the PL1 power limit being enabled and set to a low value results
> > > > > in a low GPU operating freq. It also negates the freq raise operation which
> > > > > is done before GuC firmware load. As a result GuC firmware load can time
> > > > > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > > > > limit was enabled and set to a low value). Therefore disable the PL1 power
> > > > > limit when allowed by HW when loading GuC firmware.
> > > > v3 label missing in subject.
> > > > >
> > > > > v2:
> > > > >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> > > > >   - Add hwm_power_max_restore to error return code path
> > > > >
> > > > > v3 (Jani N):
> > > > >   - Add/remove explanatory comments
> > > > >   - Function renames
> > > > >   - Type corrections
> > > > >   - Locking annotation
> > > > >
> > > > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > > > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > > > ---
> > > > >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> > > > >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> > > > >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> > > > >   3 files changed, 55 insertions(+)
> > > > >
> > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > @@ -18,6 +18,7 @@
> > > > >   #include "intel_uc.h"
> > > > >     #include "i915_drv.h"
> > > > > +#include "i915_hwmon.h"
> > > > >     static const struct intel_uc_ops uc_ops_off;
> > > > >   static const struct intel_uc_ops uc_ops_on;
> > > > > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > >	struct intel_guc *guc = &uc->guc;
> > > > >	struct intel_huc *huc = &uc->huc;
> > > > >	int ret, attempts;
> > > > > +	bool pl1en;
> > > >
> > > > Init to 'false' here
> > >
> > > See next comment.
> > >
> > > >
> > > >
> > > > >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> > > > >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > > > > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > >	else
> > > > >		attempts = 1;
> > > > >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > > > > raised */
> > > > > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > > > > +
> > > > >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> > > > >		while (attempts--) {
> > > > > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > > >	}
> > > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > > > +
> > > > >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> > > > >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> > > > >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > >	/* Return GT back to RPn */
> > > > >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > >
> > > > if (pl1en)
> > > >
> > > >     i915_hwmon_power_max_enable().
> > >
> > > IMO it's better not to have checks in the main __uc_init_hw() function (if
> > > we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> > > want we could do something like this inside
> > > i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> > > am not making any changes.
> > >
> > > (I can send a patch with the changes if you want to take a look but IMO it
> > > will add more logic/code but without real benefits (it will save a rmw if
> > > the limit was already disabled, but IMO this code is called so infrequently
> > > (only during GuC resets) as to not have any significant impact)).
> > >
> > > >
> > > > > +
> > > > >	__uc_sanitize(uc);
> > > > >		if (!ret) {
> > > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > index ee63a8fd88fc1..769b5bda4d53f 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> > > > >	}
> > > > >   }
> > > > >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > > > > *old)
> > > > Shouldn't we call this i915_hwmon_package_pl1_disable()?
> > >
> > > I did think of using "pl1" in the function name but then decided to retain
> > > "power_max" because other hwmon functions for PL1 limit also use
> > > "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> > > "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> > > show that all these functions deal with the PL1 power limit.
> > >
> > > There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> > > power limit.
> > >
> > > > > +	__acquires(i915->hwmon->hwmon_lock)
> > > > > +{
> > > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > > +	intel_wakeref_t wakeref;
> > > > > +	u32 r;
> > > > > +
> > > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > > +		return;
> > > > > +
> > > > > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > > > > +	mutex_lock(&hwmon->hwmon_lock);
> > > > > +
> > > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > > > > +				     hwmon->rg.pkg_rapl_limit,
> > > > > +				     PKG_PWR_LIM_1_EN, 0);
> > > > Most of this code (lock and rmw parts) is already inside static void
> > > > hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> > >
> > > This was the case in v1 of the patch:
> > >
> > > https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> > >
> > > But now this cannot be done because if you notice we acquire the mutex in
> > > i915_hwmon_power_max_disable() and release the mutex in
> > > i915_hwmon_power_max_restore().
> > >
> > > > I explained the reason why the mutex is handled this way in my reply
> > > to Jani Nikula here:
> > >
> > > https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> > >
> > > Quoting below:
> > >
> > > ```
> > > > > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> > > >
> > > > Not too happy about that... any better ideas?
> > >
> > > Afais, taking the mutex is the only fully correct solution (when we disable
> > > the power limit, userspace can go re-enable it). Examples of partly
> > > incorrect solutions (which don't take the mutex) include:
> > >
> > > a. Don't take the mutex, don't do anything, ignore any changes to the value
> > >    if it has changed during GuC reset/fw load (just overwrite the changed
> > >    value). Con: changed value is lost.
> > >
> > > b. Detect if the value has changed (the limit has been re-enabled) after we
> > >    have disabled the limit and in that case skip restoring the value. But
> > >    then someone can say why do we allow enabling the PL1 limit since we
> > >    want to disable it.
> > >
> > > Both these are very unlikely scenarios so they might work. But I would
> > > > first like to explore if holding a mutex across GuC reset is problematic
> > > since that is /the/ correct solution. But if anyone comes up with a reason
> > > why that cannot be done we can look at these other not completely correct
> > > options.
> >
> > I see what you are doing and it looks indeed a very safe approach to ensure
> > the pl1 won't be toggled by other paths while we need some guaranteed state
> > here, or hw init fails badly.
> >
> > But in the end you are making your lock to protect the code from another path
> > and not protecting the data itself. The data was already protected in the
> > first version with the lock in the rmw.
> 
> Sorry I am not really following. Daniel had mentioned this "protecting code
> vs protecting data" but I am wondering how it is applicable in this
> case. IMO here the data we are protecting is the register which we don't
> want written to by userland while GuC load is in progress. To do that we
> need to block the code path writing to register. So what we have here seems
> to me to be the simplest and cleanest approach for solving this issue.

I believe your case here is exactly what Daniel had mentioned as protecting
code and not data. Well, in the end we are of course protecting data to be
modified, but in your case you use that mutex to also protect the code path
and avoid other calls while you are in this guc_init_path...

Please Daniel, correct me here if I got it wrong.

What I don't like here is that we take the lock in one function, hold it for a
while, and release it from another function. To protect the data itself, in general
we only need to hold the lock for the very short time while we are modifying the data.

> 
> > maybe we need to have some kind of a state check with other state-lock and
> > then if we are in this forced state for init path, the request from the normal path
> > is ignored and we move on,
> 
> I don't see how this will *not* be racy...

maybe something like this?:

at power_max_disable:
mutex_lock(data_lock);

mutex_lock(state_lock);
state = in_use;
mutex_unlock(state_lock);

mmio_rmw();
mutex_unlock(data_lock);


at power_max_restore:
mutex_lock(data_lock);

mutex_lock(state_lock);
state = available;
mutex_unlock(state_lock);

mmio_rmw();
mutex_unlock(data_lock);

at sysfs fn:

mutex_lock(data_lock);
mutex_lock(state_lock);
if (state == in_use) {
   ret = -EAGAIN
   goto out;
}
mutex_unlock(state_lock);

....

out:

mutex_unlock(data_lock);

> 
> > or maybe we queue some request...
> 
> Queuing a request will not be enough (even if this is possible), the
> request will need to wait to complete till GuC load completes. So we'll
> have to complete the request when GuC load completes, similar to releasing
> the mutex in the current patch. Looks like a much more complicated way of
> doing what the mutex does very simply.

The wq would sleep/delay while state == in_use, then process the next request...

> 
> So:
> 
> a. What is the real problem with the current implementation?

probably the big lock used to protect the state machinery...

but if other folks believe that we don't have an actual problem here
and this big lock is acceptable as long as it has the annotation for
the static analyzers, I'm okay to just let it go...


> 
> b. What would be the correct solution for it? That is how, specifically,
>    should we implement it?

state handling with separated lock from the data itself is my suggestion.

> 
> Some more guidance will be helpful if you think this patch has issues.

I hope Daniel and/or other i915 maintainers can jump in here. Especially if
I'm being too paranoid and the current patch is enough...

> 
> Thanks.
> --
> Ashutosh
> 
> > > ```
> > >
> > > > > +
> > > > > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > > > > +}
> > > > > +
> > > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > > > > +	__releases(i915->hwmon->hwmon_lock)
> > > > We can just call this i915_hwmon_power_max_enable() and call whenever the
> > > > old value was actually enabled. That way, we have proper mirror functions.
> > >
> > > As I explained that would mean adding two checks in the main __uc_init_hw()
> > > function which I am trying to avoid. So we have disable/restore pair.
> > >
> > > > > +{
> > > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > > +	intel_wakeref_t wakeref;
> > > > > +
> > > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > > +		return;
> > > > > +
> > > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > > > > +				 hwmon->rg.pkg_rapl_limit,
> > > > > +				 PKG_PWR_LIM_1_EN,
> > > > > +				 old ? PKG_PWR_LIM_1_EN : 0);
> > > >
> > > > 3rd param should be 0 here, else we will end up clearing other bits.
> > >
> > > No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> > > the code here is correct. intel_uncore_rmw() does:
> > >
> > >         val = (old & ~clear) | set;
> > >
> > > So for now I am not making any changes, if you feel strongly about
> > > something one way or another let me know. Anyway these comments should help
> > > you understand the patch better so take a look and we can go from there.
> > >
> > > Thanks.
> > > --
> > > Ashutosh
> > >
> > > > > +
> > > > > +	mutex_unlock(&hwmon->hwmon_lock);
> > > > > +}
> > > > > +
> > > > >   static umode_t
> > > > >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> > > > >   {
> > > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > index 7ca9cf2c34c96..0fcb7de844061 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > @@ -7,14 +7,21 @@
> > > > >   #ifndef __I915_HWMON_H__
> > > > >   #define __I915_HWMON_H__
> > > > >   +#include <linux/types.h>
> > > > > +
> > > > >   struct drm_i915_private;
> > > > > +struct intel_gt;
> > > > >     #if IS_REACHABLE(CONFIG_HWMON)
> > > > >   void i915_hwmon_register(struct drm_i915_private *i915);
> > > > >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > > > > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> > > > >   #else
> > > > >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> > > > >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > > > > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > > > > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> > > > >   #endif
> > > > >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-27 17:47           ` Rodrigo Vivi
  (?)
@ 2023-03-28  9:14           ` Tvrtko Ursulin
  2023-04-06  4:52             ` Dixit, Ashutosh
  -1 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2023-03-28  9:14 UTC (permalink / raw)
  To: Rodrigo Vivi, Dixit, Ashutosh, Daniel Vetter
  Cc: Jani Nikula, intel-gfx, dri-devel


On 27/03/2023 18:47, Rodrigo Vivi wrote:
> 
> +Daniel
> 
> On Mon, Mar 27, 2023 at 09:58:52AM -0700, Dixit, Ashutosh wrote:
>> On Sun, 26 Mar 2023 04:52:59 -0700, Rodrigo Vivi wrote:
>>>
>>
>> Hi Rodrigo,
>>
>>> On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
>>>> On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
>>>>>
>>>>
>>>> Hi Vinay,
>>>>
>>>> Thanks for the review. Comments inline below.
>>>>
>>>>> On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
>>>>>> On dGfx, the PL1 power limit being enabled and set to a low value results
>>>>>> in a low GPU operating freq. It also negates the freq raise operation which
>>>>>> is done before GuC firmware load. As a result GuC firmware load can time
>>>>>> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
>>>>>> limit was enabled and set to a low value). Therefore disable the PL1 power
>>>>>> limit when allowed by HW when loading GuC firmware.
>>>>> v3 label missing in subject.
>>>>>>
>>>>>> v2:
>>>>>>    - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
>>>>>>    - Add hwm_power_max_restore to error return code path
>>>>>>
>>>>>> v3 (Jani N):
>>>>>>    - Add/remove explanatory comments
>>>>>>    - Function renames
>>>>>>    - Type corrections
>>>>>>    - Locking annotation
>>>>>>
>>>>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
>>>>>> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
>>>>>> ---
>>>>>>    drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
>>>>>>    drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
>>>>>>    drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
>>>>>>    3 files changed, 55 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>>>>> index 4ccb4be4c9cba..aa8e35a5636a0 100644
>>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>>>>> @@ -18,6 +18,7 @@
>>>>>>    #include "intel_uc.h"
>>>>>>      #include "i915_drv.h"
>>>>>> +#include "i915_hwmon.h"
>>>>>>      static const struct intel_uc_ops uc_ops_off;
>>>>>>    static const struct intel_uc_ops uc_ops_on;
>>>>>> @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
>>>>>> 	struct intel_guc *guc = &uc->guc;
>>>>>> 	struct intel_huc *huc = &uc->huc;
>>>>>> 	int ret, attempts;
>>>>>> +	bool pl1en;
>>>>>
>>>>> Init to 'false' here
>>>>
>>>> See next comment.
>>>>
>>>>>
>>>>>
>>>>>> 		GEM_BUG_ON(!intel_uc_supports_guc(uc));
>>>>>> 	GEM_BUG_ON(!intel_uc_wants_guc(uc));
>>>>>> @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
>>>>>> 	else
>>>>>> 		attempts = 1;
>>>>>>    +	/* Disable a potentially low PL1 power limit to allow freq to be
>>>>>> raised */
>>>>>> +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
>>>>>> +
>>>>>> 	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
>>>>>> 		while (attempts--) {
>>>>>> @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>>>>>> 		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>>>>>> 	}
>>>>>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
>>>>>> +
>>>>>> 	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
>>>>>> 	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
>>>>>>    @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
>>>>>> 	/* Return GT back to RPn */
>>>>>> 	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
>>>>>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
>>>>>
>>>>> if (pl1en)
>>>>>
>>>>>      i915_hwmon_power_max_enable().
>>>>
>>>> IMO it's better not to have checks in the main __uc_init_hw() function (if
>>>> we do this we'll need to add 2 checks in __uc_init_hw()). If you really
>>>> want we could do something like this inside
>>>> i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
>>>> am not making any changes.
>>>>
>>>> (I can send a patch with the changes if you want to take a look but IMO it
>>>> will add more logic/code but without real benefits (it will save a rmw if
>>>> the limit was already disabled, but IMO this code is called so infrequently
>>>> (only during GuC resets) as to not have any significant impact)).
>>>>
>>>>>
>>>>>> +
>>>>>> 	__uc_sanitize(uc);
>>>>>> 		if (!ret) {
>>>>>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
>>>>>> index ee63a8fd88fc1..769b5bda4d53f 100644
>>>>>> --- a/drivers/gpu/drm/i915/i915_hwmon.c
>>>>>> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
>>>>>> @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
>>>>>> 	}
>>>>>>    }
>>>>>>    +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
>>>>>> *old)
>>>>> Shouldn't we call this i915_hwmon_package_pl1_disable()?
>>>>
>>>> I did think of using "pl1" in the function name but then decided to retain
>>>> "power_max" because other hwmon functions for PL1 limit also use
>>>> "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
>>>> "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
>>>> show that all these functions deal with the PL1 power limit.
>>>>
>>>> There is a comment in __uc_init_hw() explaining "power_max" means the PL1
>>>> power limit.
>>>>
>>>>>> +	__acquires(i915->hwmon->hwmon_lock)
>>>>>> +{
>>>>>> +	struct i915_hwmon *hwmon = i915->hwmon;
>>>>>> +	intel_wakeref_t wakeref;
>>>>>> +	u32 r;
>>>>>> +
>>>>>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
>>>>>> +		return;
>>>>>> +
>>>>>> +	/* Take mutex to prevent concurrent hwm_power_max_write */
>>>>>> +	mutex_lock(&hwmon->hwmon_lock);
>>>>>> +
>>>>>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
>>>>>> +		r = intel_uncore_rmw(hwmon->ddat.uncore,
>>>>>> +				     hwmon->rg.pkg_rapl_limit,
>>>>>> +				     PKG_PWR_LIM_1_EN, 0);
>>>>> Most of this code (lock and rmw parts) is already inside static void
>>>>> hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
>>>>
>>>> This was the case in v1 of the patch:
>>>>
>>>> https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
>>>>
>>>> But now this cannot be done because if you notice we acquire the mutex in
>>>> i915_hwmon_power_max_disable() and release the mutex in
>>>> i915_hwmon_power_max_restore().
>>>>
>>>> I explained the reason why the mutex is handled this way in my reply
>>>> to Jani Nikula here:
>>>>
>>>> https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
>>>>
>>>> Quoting below:
>>>>
>>>> ```
>>>>>> +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
>>>>>
>>>>> Not too happy about that... any better ideas?
>>>>
>>>> Afais, taking the mutex is the only fully correct solution (when we disable
>>>> the power limit, userspace can go re-enable it). Examples of partly
>>>> incorrect solutions (which don't take the mutex) include:
>>>>
>>>> a. Don't take the mutex, don't do anything, ignore any changes to the value
>>>>     if it has changed during GuC reset/fw load (just overwrite the changed
>>>>     value). Con: changed value is lost.
>>>>
>>>> b. Detect if the value has changed (the limit has been re-enabled) after we
>>>>     have disabled the limit and in that case skip restoring the value. But
>>>>     then someone can say why do we allow enabling the PL1 limit since we
>>>>     want to disable it.
>>>>
>>>> Both these are very unlikely scenarios so they might work. But I would
>>>> first like to explore if holding a mutex across GuC reset is problematic
>>>> since that is /the/ correct solution. But if anyone comes up with a reason
>>>> why that cannot be done we can look at these other not completely correct
>>>> options.
>>>
>>> I see what you are doing and it looks indeed a very safe approach to ensure
>>> the pl1 won't be toggled by other paths while we need some guaranteed state
>>> here, or hw init fails badly.
>>>
>>> But in the end you are making your lock to protect the code from another path
>>> and not protecting the data itself. The data was already protected in the
>>> first version with the lock in the rmw.
>>
>> Sorry I am not really following. Daniel had mentioned this "protecting code
>> vs protecting data" but I am wondering how it is applicable in this
>> case. IMO here the data we are protecting is the register which we don't
>> want written to by userland while GuC load is in progress. To do that we
>> need to block the code path writing to register. So what we have here seems
>> to me to be the simplest and cleanest approach for solving this issue.
> 
> I believe your case here is exactly what Daniel had mentioned as protecting
> code and not data. Well, in the end we are of course protecting data to be
> modified, but in your case you use that mutex to also protect the code path
> and avoid other calls while you are in this guc_init_path...
> 
> Please Daniel, correct me here if I got it wrong.
> 
> What I don't like here is that we lock from one function and keep that for a
> while and unlock from the other function. To protect the data itself in general
> we just need for a very minimal time while we are modifying the data itself.
> 
>>
>>> maybe we need to have some kind of a state check with other state-lock and
>>> then if we are in this forced state for init path, the request for the normal path
>>> ignores it and moves on,
>>
>> I don't see how this will *not* be racy...
> 
> maybe something like this?:
> 
> at power_max_disable:
> mutex_lock(data_lock);
> 
> mutex_lock(state_lock);
> state = in_use;
> mutex_unlock(state_lock);
> 
> mmio_rmw();
> mutex_unlock(data_lock);
> 
> 
> at power_max_restoration:
> 
> at power_max_disable:
> mutex_lock(data_lock);
> 
> mutex_lock(state_lock);
> state = available;
> mutex_unlock(state_lock);
> 
> mmio_rmw();
> mutex_unlock(data_lock);
> 
> at sysfs fn:
> 
> mutex_lock(data_lock);
> mutex_lock(state_lock);
> if (state == in_use) {
>     ret = -EAGAIN
>     goto out;
> }
> mutex_unlock(state_lock);
> 
> ....
> 
> out:
> 
> mutex_unlock(data_lock);

I agree holding the mutex across functions to cover the GuC init path is 
not the nicest pattern. The above looks like a plausible improvement, 
although I don't know if EAGAIN is correct for hwmon, or if blocking is. 
Is something expected to be configuring those fields during boot and can 
it even handle EAGAIN?

One advantage of the solution from this patch I can see though is that I 
think it eliminates data races (restoring the stale value) with fw 
reload triggered by a potential full GPU reset happening in parallel to 
sysfs writes.

Another thing to check would be if the inversions between 
hwmon_lock->rpm_get and rpm_get->hwmon_lock are okay.

In fact, I am not sure rpm_get in this patch is needed? Seems to be 
running under paths which guarantee holding it already, if I am not 
missing something. If not needed then there is obviously no inversion in 
any way.

Regards,

Tvrtko

P.S.
Do some of the existing mutex_lock calls actually need to be 
mutex_lock_interruptible so sysfs reads/writes can be interrupted with 
Ctrl-C, in theory at least?


>>> or maybe we queue some request...
>>
>> Queuing a request will not be enough (even if this is possible), the
>> request will need to wait to complete till GuC load completes. So we'll
>> have to complete the request when GuC load completes, similar to releasing
>> the mutex in the current patch. Looks like a much more complicated way of
>> doing what the mutex does very simply.
> 
> The wq would sleep/delay while state == in_use, then process the next request...
> 
>>
>> So:
>>
>> a. What is the real problem with the current implementation?
> 
> probably the big lock used to protect the state machinery...
> 
> but if other folks believe that we don't have an actual problem here
> and this big lock is acceptable as long as it has the annotation for
> the static analyzers, I'm okay to just let it go...
> 
> 
>>
>> b. What would be the correct solution for it? That is how, specifically,
>>     should we implement it?
> 
> state handling with separated lock from the data itself is my suggestion.
> 
>>
>> Some more guidance will be helpful if you think this patch has issues.
> 
> I hope Daniel and/or other i915 maintainers can jump in here. Especially if
> I'm being too paranoid and the current patch is enough...
> 
>>
>> Thanks.
>> --
>> Ashutosh
>>
>>>> ```
>>>>
>>>>>> +
>>>>>> +	*old = !!(r & PKG_PWR_LIM_1_EN);
>>>>>> +}
>>>>>> +
>>>>>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
>>>>>> +	__releases(i915->hwmon->hwmon_lock)
>>>>> We can just call this i915_hwmon_power_max_enable() and call whenever the
>>>>> old value was actually enabled. That way, we have proper mirror functions.
>>>>
>>>> As I explained that would mean adding two checks in the main __uc_init_hw()
>>>> function which I am trying to avoid. So we have disable/restore pair.
>>>>
>>>>>> +{
>>>>>> +	struct i915_hwmon *hwmon = i915->hwmon;
>>>>>> +	intel_wakeref_t wakeref;
>>>>>> +
>>>>>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
>>>>>> +		return;
>>>>>> +
>>>>>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
>>>>>> +		intel_uncore_rmw(hwmon->ddat.uncore,
>>>>>> +				 hwmon->rg.pkg_rapl_limit,
>>>>>> +				 PKG_PWR_LIM_1_EN,
>>>>>> +				 old ? PKG_PWR_LIM_1_EN : 0);
>>>>>
>>>>> 3rd param should be 0 here, else we will end up clearing other bits.
>>>>
>>>> No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
>>>> the code here is correct. intel_uncore_rmw() does:
>>>>
>>>>          val = (old & ~clear) | set;
>>>>
>>>> So for now I am not making any changes, if you feel strongly about
>>>> something one way or another let me know. Anyway these comments should help
>>>> you understand the patch better so take a look and we can go from there.
>>>>
>>>> Thanks.
>>>> --
>>>> Ashutosh
>>>>
>>>>>> +
>>>>>> +	mutex_unlock(&hwmon->hwmon_lock);
>>>>>> +}
>>>>>> +
>>>>>>    static umode_t
>>>>>>    hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
>>>>>>    {
>>>>>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
>>>>>> index 7ca9cf2c34c96..0fcb7de844061 100644
>>>>>> --- a/drivers/gpu/drm/i915/i915_hwmon.h
>>>>>> +++ b/drivers/gpu/drm/i915/i915_hwmon.h
>>>>>> @@ -7,14 +7,21 @@
>>>>>>    #ifndef __I915_HWMON_H__
>>>>>>    #define __I915_HWMON_H__
>>>>>>    +#include <linux/types.h>
>>>>>> +
>>>>>>    struct drm_i915_private;
>>>>>> +struct intel_gt;
>>>>>>      #if IS_REACHABLE(CONFIG_HWMON)
>>>>>>    void i915_hwmon_register(struct drm_i915_private *i915);
>>>>>>    void i915_hwmon_unregister(struct drm_i915_private *i915);
>>>>>> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
>>>>>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
>>>>>>    #else
>>>>>>    static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
>>>>>>    static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
>>>>>> +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
>>>>>> +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
>>>>>>    #endif
>>>>>>      #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-27 17:47           ` Rodrigo Vivi
@ 2023-04-06  4:50             ` Dixit, Ashutosh
  -1 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-04-06  4:50 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: Tvrtko Ursulin, intel-gfx, dri-devel

On Mon, 27 Mar 2023 10:47:25 -0700, Rodrigo Vivi wrote:
>

Hi Rodrigo,

Sorry for the delay, I got pulled away into a couple of other things and
could only now get back to this.

>
> +Daniel
>
> On Mon, Mar 27, 2023 at 09:58:52AM -0700, Dixit, Ashutosh wrote:
> > On Sun, 26 Mar 2023 04:52:59 -0700, Rodrigo Vivi wrote:
> > >
> >
> > Hi Rodrigo,
> >
> > > On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> > > > On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> > > > >
> > > >
> > > > Hi Vinay,
> > > >
> > > > Thanks for the review. Comments inline below.
> > > >
> > > > > On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > > > > > On dGfx, the PL1 power limit being enabled and set to a low value results
> > > > > > in a low GPU operating freq. It also negates the freq raise operation which
> > > > > > is done before GuC firmware load. As a result GuC firmware load can time
> > > > > > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > > > > > limit was enabled and set to a low value). Therefore disable the PL1 power
> > > > > > limit when allowed by HW when loading GuC firmware.
> > > > > v3 label missing in subject.
> > > > > >
> > > > > > v2:
> > > > > >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> > > > > >   - Add hwm_power_max_restore to error return code path
> > > > > >
> > > > > > v3 (Jani N):
> > > > > >   - Add/remove explanatory comments
> > > > > >   - Function renames
> > > > > >   - Type corrections
> > > > > >   - Locking annotation
> > > > > >
> > > > > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > > > > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > > > > ---
> > > > > >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> > > > > >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> > > > > >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> > > > > >   3 files changed, 55 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > > > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > > @@ -18,6 +18,7 @@
> > > > > >   #include "intel_uc.h"
> > > > > >     #include "i915_drv.h"
> > > > > > +#include "i915_hwmon.h"
> > > > > >     static const struct intel_uc_ops uc_ops_off;
> > > > > >   static const struct intel_uc_ops uc_ops_on;
> > > > > > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > > >	struct intel_guc *guc = &uc->guc;
> > > > > >	struct intel_huc *huc = &uc->huc;
> > > > > >	int ret, attempts;
> > > > > > +	bool pl1en;
> > > > >
> > > > > Init to 'false' here
> > > >
> > > > See next comment.
> > > >
> > > > >
> > > > >
> > > > > >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> > > > > >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > > > > > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > > >	else
> > > > > >		attempts = 1;
> > > > > >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > > > > > raised */
> > > > > > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > > > > > +
> > > > > >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> > > > > >		while (attempts--) {
> > > > > > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > > >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > > > >	}
> > > > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > > > > +
> > > > > >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> > > > > >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> > > > > >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > > >	/* Return GT back to RPn */
> > > > > >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > > >
> > > > > if (pl1en)
> > > > >
> > > > >     i915_hwmon_power_max_enable().
> > > >
> > > > IMO it's better not to have checks in the main __uc_init_hw() function (if
> > > > we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> > > > want we could do something like this inside
> > > > i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> > > > am not making any changes.
> > > >
> > > > (I can send a patch with the changes if you want to take a look but IMO it
> > > > will add more logic/code but without real benefits (it will save a rmw if
> > > > the limit was already disabled, but IMO this code is called so infrequently
> > > > (only during GuC resets) as to not have any significant impact)).
> > > >
> > > > >
> > > > > > +
> > > > > >	__uc_sanitize(uc);
> > > > > >		if (!ret) {
> > > > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > > index ee63a8fd88fc1..769b5bda4d53f 100644
> > > > > > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> > > > > >	}
> > > > > >   }
> > > > > >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > > > > > *old)
> > > > > Shouldn't we call this i915_hwmon_package_pl1_disable()?
> > > >
> > > > I did think of using "pl1" in the function name but then decided to retain
> > > > "power_max" because other hwmon functions for PL1 limit also use
> > > > "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> > > > "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> > > > show that all these functions deal with the PL1 power limit.
> > > >
> > > > There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> > > > power limit.
> > > >
> > > > > > +	__acquires(i915->hwmon->hwmon_lock)
> > > > > > +{
> > > > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > > > +	intel_wakeref_t wakeref;
> > > > > > +	u32 r;
> > > > > > +
> > > > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > > > +		return;
> > > > > > +
> > > > > > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > > > > > +	mutex_lock(&hwmon->hwmon_lock);
> > > > > > +
> > > > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > > > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > > > > > +				     hwmon->rg.pkg_rapl_limit,
> > > > > > +				     PKG_PWR_LIM_1_EN, 0);
> > > > > Most of this code (lock and rmw parts) is already inside static void
> > > > > hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> > > >
> > > > This was the case in v1 of the patch:
> > > >
> > > > https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> > > >
> > > > But now this cannot be done because if you notice we acquire the mutex in
> > > > i915_hwmon_power_max_disable() and release the mutex in
> > > > i915_hwmon_power_max_restore().
> > > >
> > > > I explained the reason why the mutex is handled this way in my reply
> > > > to Jani Nikula here:
> > > >
> > > > https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> > > >
> > > > Quoting below:
> > > >
> > > > ```
> > > > > > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> > > > >
> > > > > Not too happy about that... any better ideas?
> > > >
> > > > Afais, taking the mutex is the only fully correct solution (when we disable
> > > > the power limit, userspace can go re-enable it). Examples of partly
> > > > incorrect solutions (which don't take the mutex) include:
> > > >
> > > > a. Don't take the mutex, don't do anything, ignore any changes to the value
> > > >    if it has changed during GuC reset/fw load (just overwrite the changed
> > > >    value). Con: changed value is lost.
> > > >
> > > > b. Detect if the value has changed (the limit has been re-enabled) after we
> > > >    have disabled the limit and in that case skip restoring the value. But
> > > >    then someone can say why do we allow enabling the PL1 limit since we
> > > >    want to disable it.
> > > >
> > > > Both these are very unlikely scenarios so they might work. But I would
> > > > first like to explore if holding a mutex across GuC reset is problematic
> > > > since that is /the/ correct solution. But if anyone comes up with a reason
> > > > why that cannot be done we can look at these other not completely correct
> > > > options.
> > >
> > > I see what you are doing and it looks indeed a very safe approach to ensure
> > > the pl1 won't be toggled by other paths while we need some guaranteed state
> > > here, or hw init fails badly.
> > >
> > > But in the end you are making your lock to protect the code from another path
> > > and not protecting the data itself. The data was already protected in the
> > > first version with the lock in the rmw.
> >
> > Sorry I am not really following. Daniel had mentioned this "protecting code
> > vs protecting data" but I am wondering how it is applicable in this
> > case. IMO here the data we are protecting is the register which we don't
> > want written to by userland while GuC load is in progress. To do that we
> > need to block the code path writing to register. So what we have here seems
> > to me to be the simplest and cleanest approach for solving this issue.
>
> I believe your case here is exactly what Daniel had mentioned as protecting
> code and not data. Well, in the end we are of course protecting data to be
> modified, but in your case you use that mutex to also protect the code path
> and avoid other calls while you are in this guc_init_path...
>
> Please Daniel, correct me here if I got it wrong.
>
> What I don't like here is that we lock from one function and keep that for a
> while and unlock from the other function. To protect the data itself in general
> we just need for a very minimal time while we are modifying the data itself.
>
> >
> > > maybe we need to have some kind of a state check with other state-lock and
> > > then if we are in this forced state for init path, the request for the normal path
> > > ignores it and moves on,
> >
> > I don't see how this will *not* be racy...
>
> maybe something like this?:
>
> at power_max_disable:
> mutex_lock(data_lock);
>
> mutex_lock(state_lock);
> state = in_use;
> mutex_unlock(state_lock);
>
> mmio_rmw();
> mutex_unlock(data_lock);
>
>
> at power_max_restoration:
>
> at power_max_disable:
> mutex_lock(data_lock);
>
> mutex_lock(state_lock);
> state = available;
> mutex_unlock(state_lock);
>
> mmio_rmw();
> mutex_unlock(data_lock);
>
> at sysfs fn:
>
> mutex_lock(data_lock);
> mutex_lock(state_lock);
> if (state == in_use) {
>    ret = -EAGAIN
>    goto out;
> }
> mutex_unlock(state_lock);
>
> ....
>
> out:
>
> mutex_unlock(data_lock);

Thanks for suggesting this, actually it worked out quite nicely and I have
implemented this in the latest version.

Though I believe state_lock is not needed (data_lock is sufficient) so I
have skipped that. Please take a look at:

https://patchwork.freedesktop.org/series/116172/

Thanks.
--
Ashutosh


>
> >
> > > or maybe we queue some request...
> >
> > Queuing a request will not be enough (even if this is possible), the
> > request will need to wait to complete till GuC load completes. So we'll
> > have to complete the request when GuC load completes, similar to releasing
> > the mutex in the current patch. Looks like a much more complicated way of
> > doing what the mutex does very simply.
>
> The wq would sleep/delay while state == in_use, then process the next request...
>
> >
> > So:
> >
> > a. What is the real problem with the current implementation?
>
> probably the big lock used to protect the state machinery...
>
> but if other folks believe that we don't have an actual problem here
> and this big lock is acceptable as long as it has the annotation for
> the static analyzers, I'm okay to just let it go...
>
>
> >
> > b. What would be the correct solution for it? That is how, specifically,
> >    should we implement it?
>
> state handling with separated lock from the data itself is my suggestion.
>
> >
> > Some more guidance will be helpful if you think this patch has issues.
>
> I hope Daniel and/or other i915 maintainers can jump in here. Especially if
> I'm being too paranoid and the current patch is enough...
>
> >
> > Thanks.
> > --
> > Ashutosh
> >
> > > > ```
> > > >
> > > > > > +
> > > > > > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > > > > > +}
> > > > > > +
> > > > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > > > > > +	__releases(i915->hwmon->hwmon_lock)
> > > > > We can just call this i915_hwmon_power_max_enable() and call whenever the
> > > > > old value was actually enabled. That way, we have proper mirror functions.
> > > >
> > > > As I explained that would mean adding two checks in the main __uc_init_hw()
> > > > function which I am trying to avoid. So we have disable/restore pair.
> > > >
> > > > > > +{
> > > > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > > > +	intel_wakeref_t wakeref;
> > > > > > +
> > > > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > > > +		return;
> > > > > > +
> > > > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > > > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > > > > > +				 hwmon->rg.pkg_rapl_limit,
> > > > > > +				 PKG_PWR_LIM_1_EN,
> > > > > > +				 old ? PKG_PWR_LIM_1_EN : 0);
> > > > >
> > > > > 3rd param should be 0 here, else we will end up clearing other bits.
> > > >
> > > > No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> > > > the code here is correct. intel_uncore_rmw() does:
> > > >
> > > >         val = (old & ~clear) | set;
> > > >
> > > > So for now I am not making any changes, if you feel strongly about
> > > > something one way or another let me know. Anyway these comments should help
> > > > you understand the patch better so take a look and we can go from there.
> > > >
> > > > Thanks.
> > > > --
> > > > Ashutosh
> > > >
> > > > > > +
> > > > > > +	mutex_unlock(&hwmon->hwmon_lock);
> > > > > > +}
> > > > > > +
> > > > > >   static umode_t
> > > > > >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> > > > > >   {
> > > > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > > index 7ca9cf2c34c96..0fcb7de844061 100644
> > > > > > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > > @@ -7,14 +7,21 @@
> > > > > >   #ifndef __I915_HWMON_H__
> > > > > >   #define __I915_HWMON_H__
> > > > > >   +#include <linux/types.h>
> > > > > > +
> > > > > >   struct drm_i915_private;
> > > > > > +struct intel_gt;
> > > > > >     #if IS_REACHABLE(CONFIG_HWMON)
> > > > > >   void i915_hwmon_register(struct drm_i915_private *i915);
> > > > > >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > > > > > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > > > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> > > > > >   #else
> > > > > >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> > > > > >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > > > > > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > > > > > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> > > > > >   #endif
> > > > > >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-04-06  4:50             ` Dixit, Ashutosh
  0 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-04-06  4:50 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: intel-gfx, dri-devel

On Mon, 27 Mar 2023 10:47:25 -0700, Rodrigo Vivi wrote:
>

Hi Rodrigo,

Sorry for the delay, I got pulled away into a couple of other things and
could only now get back to this.

>
> +Daniel
>
> On Mon, Mar 27, 2023 at 09:58:52AM -0700, Dixit, Ashutosh wrote:
> > On Sun, 26 Mar 2023 04:52:59 -0700, Rodrigo Vivi wrote:
> > >
> >
> > Hi Rodrigo,
> >
> > > On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> > > > On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> > > > >
> > > >
> > > > Hi Vinay,
> > > >
> > > > Thanks for the review. Comments inline below.
> > > >
> > > > > On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> > > > > > On dGfx, the PL1 power limit being enabled and set to a low value results
> > > > > > in a low GPU operating freq. It also negates the freq raise operation which
> > > > > > is done before GuC firmware load. As a result GuC firmware load can time
> > > > > > out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> > > > > > limit was enabled and set to a low value). Therefore disable the PL1 power
> > > > > > limit when allowed by HW when loading GuC firmware.
> > > > > v3 label missing in subject.
> > > > > >
> > > > > > v2:
> > > > > >   - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> > > > > >   - Add hwm_power_max_restore to error return code path
> > > > > >
> > > > > > v3 (Jani N):
> > > > > >   - Add/remove explanatory comments
> > > > > >   - Function renames
> > > > > >   - Type corrections
> > > > > >   - Locking annotation
> > > > > >
> > > > > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> > > > > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > > > > ---
> > > > > >   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> > > > > >   drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> > > > > >   drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> > > > > >   3 files changed, 55 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > > index 4ccb4be4c9cba..aa8e35a5636a0 100644
> > > > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > > > > > @@ -18,6 +18,7 @@
> > > > > >   #include "intel_uc.h"
> > > > > >     #include "i915_drv.h"
> > > > > > +#include "i915_hwmon.h"
> > > > > >     static const struct intel_uc_ops uc_ops_off;
> > > > > >   static const struct intel_uc_ops uc_ops_on;
> > > > > > @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > > >	struct intel_guc *guc = &uc->guc;
> > > > > >	struct intel_huc *huc = &uc->huc;
> > > > > >	int ret, attempts;
> > > > > > +	bool pl1en;
> > > > >
> > > > > Init to 'false' here
> > > >
> > > > See next comment.
> > > >
> > > > >
> > > > >
> > > > > >		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> > > > > >	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> > > > > > @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > > >	else
> > > > > >		attempts = 1;
> > > > > >   +	/* Disable a potentially low PL1 power limit to allow freq to be
> > > > > > raised */
> > > > > > +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> > > > > > +
> > > > > >	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> > > > > >		while (attempts--) {
> > > > > > @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > > >		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > > > >	}
> > > > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > > > > +
> > > > > >	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> > > > > >	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> > > > > >   @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> > > > > >	/* Return GT back to RPn */
> > > > > >	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> > > > > >   +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> > > > >
> > > > > if (pl1en)
> > > > >
> > > > >     i915_hwmon_power_max_enable().
> > > >
> > > > IMO it's better not to have checks in the main __uc_init_hw() function (if
> > > > we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> > > > want we could do something like this inside
> > > > i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> > > > am not making any changes.
> > > >
> > > > (I can send a patch with the changes if you want to take a look but IMO it
> > > > will add more logic/code but without real benefits (it will save a rmw if
> > > > the limit was already disabled, but IMO this code is called so infrequently
> > > > (only during GuC resets) as to not have any significant impact)).
> > > >
> > > > >
> > > > > > +
> > > > > >	__uc_sanitize(uc);
> > > > > >		if (!ret) {
> > > > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > > index ee63a8fd88fc1..769b5bda4d53f 100644
> > > > > > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > > > > > @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> > > > > >	}
> > > > > >   }
> > > > > >   +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> > > > > > *old)
> > > > > Shouldn't we call this i915_hwmon_package_pl1_disable()?
> > > >
> > > > I did think of using "pl1" in the function name but then decided to retain
> > > > "power_max" because other hwmon functions for PL1 limit also use
> > > > "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> > > > "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> > > > show that all these functions deal with the PL1 power limit.
> > > >
> > > > There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> > > > power limit.
> > > >
> > > > > > +	__acquires(i915->hwmon->hwmon_lock)
> > > > > > +{
> > > > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > > > +	intel_wakeref_t wakeref;
> > > > > > +	u32 r;
> > > > > > +
> > > > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > > > +		return;
> > > > > > +
> > > > > > +	/* Take mutex to prevent concurrent hwm_power_max_write */
> > > > > > +	mutex_lock(&hwmon->hwmon_lock);
> > > > > > +
> > > > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > > > +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> > > > > > +				     hwmon->rg.pkg_rapl_limit,
> > > > > > +				     PKG_PWR_LIM_1_EN, 0);
> > > > > Most of this code (lock and rmw parts) is already inside static void
> > > > > hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> > > >
> > > > This was the case in v1 of the patch:
> > > >
> > > > https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> > > >
> > > > But now this cannot be done because if you notice we acquire the mutex in
> > > > i915_hwmon_power_max_disable() and release the mutex in
> > > > i915_hwmon_power_max_restore().
> > > >
> > > > I explained the reason why the mutex is handled this way in my reply
> > > > to Jani Nikula here:
> > > >
> > > > https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> > > >
> > > > Quoting below:
> > > >
> > > > ```
> > > > > > +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> > > > >
> > > > > Not too happy about that... any better ideas?
> > > >
> > > > Afais, taking the mutex is the only fully correct solution (when we disable
> > > > the power limit, userspace can go re-enable it). Examples of partly
> > > > incorrect solutions (which don't take the mutex) include:
> > > >
> > > > a. Don't take the mutex, don't do anything, ignore any changes to the value
> > > >    if it has changed during GuC reset/fw load (just overwrite the changed
> > > >    value). Con: changed value is lost.
> > > >
> > > > b. Detect if the value has changed (the limit has been re-enabled) after we
> > > >    have disabled the limit and in that case skip restoring the value. But
> > > >    then someone can say why do we allow enabling the PL1 limit since we
> > > >    want to disable it.
> > > >
> > > > Both these are very unlikely scenarios so they might work. But I would
> > > > first like to explore if holding a mutex across GuC reset is problematic
> > > > since that is /the/ correct solution. But if anyone comes up with a reason
> > > > why that cannot be done we can look at these other not completely correct
> > > > options.
> > >
> > > I see what you are doing and it looks indeed a very safe approach to ensure
> > > the pl1 won't be toggled by other paths while we need some guaranteed state
> > > here, or hw init fails badly.
> > >
> > > But in the end you are making your lock to protect the code from another path
> > > and not protecting the data itself. The data was already protected in the
> > > first version with the lock in the rmw.
> >
> > Sorry I am not really following. Daniel had mentioned this "protecting code
> > vs protecting data" but I am wondering how it is applicable in this
> > case. IMO here the data we are protecting is the register which we don't
> > want written to by userland while GuC load is in progress. To do that we
> > need to block the code path writing to register. So what we have here seems
> > to me to be the simplest and cleanest approach for solving this issue.
>
> I believe your case here is exactly what Daniel had mentioned as protecting
> code and not data. Well, in the end we are of course protecting data to be
> modified, but in your case you use that mutex to also protect the code path
> and avoid other calls while you are in this guc_init_path...
>
> Please Daniel, correct me here if I got it wrong.
>
> What I don't like here is that we lock in one function, hold the lock for a
> while, and unlock in the other function. To protect the data itself, in general
> we just need to hold the lock for a very minimal time while we are modifying the data.
>
> >
> > > maybe we need to have some kind of a state check with a separate state-lock, and
> > > then if we are in this forced state for the init path, the request from the normal path
> > > is ignored and we move on,
> >
> > I don't see how this will *not* be racy...
>
> maybe something like this?:
>
> at power_max_disable:
> mutex_lock(data_lock);
>
> mutex_lock(state_lock);
> state = in_use;
> mutex_unlock(state_lock);
>
> mmio_rmw();
> mutex_unlock(data_lock);
>
>
> at power_max_restoration:
>
> at power_max_disable:
> mutex_lock(data_lock);
>
> mutex_lock(state_lock);
> state = available;
> mutex_unlock(state_lock);
>
> mmio_rmw();
> mutex_unlock(data_lock);
>
> at sysfs fn:
>
> mutex_lock(data_lock);
> mutex_lock(state_lock);
> if (state == in_use) {
>    ret = -EAGAIN
>    goto out;
> }
> mutex_unlock(state_lock);
>
> ....
>
> out:
>
> mutex_unlock(data_lock);

Thanks for suggesting this, actually it worked out quite nicely and I have
implemented this in the latest version.

Though I believe state_lock is not needed (data_lock is sufficient) so I
have skipped that. Please take a look at:

https://patchwork.freedesktop.org/series/116172/

Thanks.
--
Ashutosh


>
> >
> > > or maybe we queue some request...
> >
> > Queuing a request will not be enough (even if this is possible), the
> > request will need to wait to complete till GuC load completes. So we'll
> > have to complete the request when GuC load completes, similar to releasing
> > the mutex in the current patch. Looks like a much more complicated way of
> > doing what the mutex does very simply.
>
> The wq would sleep/delay while state == in_use, then process the next request...
>
> >
> > So:
> >
> > a. What is the real problem with the current implementation?
>
> probably the big lock used to protect the state machinery...
>
> but if other folks believe that we don't have an actual problem here
> and this big lock is acceptable as long as it has the annotation for
> the static analyzers, I'm okay to just let it go...
>
>
> >
> > b. What would be the correct solution for it? That is how, specifically,
> >    should we implement it?
>
> state handling with separated lock from the data itself is my suggestion.
>
> >
> > Some more guidance will be helpful if you think this patch has issues.
>
> I hope Daniel and/or other i915 maintainers can jump in here. Especially if
> I'm being too paranoid and the current patch is enough...
>
> >
> > Thanks.
> > --
> > Ashutosh
> >
> > > > ```
> > > >
> > > > > > +
> > > > > > +	*old = !!(r & PKG_PWR_LIM_1_EN);
> > > > > > +}
> > > > > > +
> > > > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> > > > > > +	__releases(i915->hwmon->hwmon_lock)
> > > > > We can just call this i915_hwmon_power_max_enable() and call whenever the
> > > > > old value was actually enabled. That way, we have proper mirror functions.
> > > >
> > > > As I explained that would mean adding two checks in the main __uc_init_hw()
> > > > function which I am trying to avoid. So we have disable/restore pair.
> > > >
> > > > > > +{
> > > > > > +	struct i915_hwmon *hwmon = i915->hwmon;
> > > > > > +	intel_wakeref_t wakeref;
> > > > > > +
> > > > > > +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> > > > > > +		return;
> > > > > > +
> > > > > > +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> > > > > > +		intel_uncore_rmw(hwmon->ddat.uncore,
> > > > > > +				 hwmon->rg.pkg_rapl_limit,
> > > > > > +				 PKG_PWR_LIM_1_EN,
> > > > > > +				 old ? PKG_PWR_LIM_1_EN : 0);
> > > > >
> > > > > 3rd param should be 0 here, else we will end up clearing other bits.
> > > >
> > > > No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> > > > the code here is correct. intel_uncore_rmw() does:
> > > >
> > > >         val = (old & ~clear) | set;
> > > >
> > > > So for now I am not making any changes, if you feel strongly about
> > > > something one way or another let me know. Anyway these comments should help
> > > > you understand the patch better so take a look and we can go from there.
> > > >
> > > > Thanks.
> > > > --
> > > > Ashutosh
> > > >
> > > > > > +
> > > > > > +	mutex_unlock(&hwmon->hwmon_lock);
> > > > > > +}
> > > > > > +
> > > > > >   static umode_t
> > > > > >   hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> > > > > >   {
> > > > > > diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > > index 7ca9cf2c34c96..0fcb7de844061 100644
> > > > > > --- a/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > > +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> > > > > > @@ -7,14 +7,21 @@
> > > > > >   #ifndef __I915_HWMON_H__
> > > > > >   #define __I915_HWMON_H__
> > > > > >   +#include <linux/types.h>
> > > > > > +
> > > > > >   struct drm_i915_private;
> > > > > > +struct intel_gt;
> > > > > >     #if IS_REACHABLE(CONFIG_HWMON)
> > > > > >   void i915_hwmon_register(struct drm_i915_private *i915);
> > > > > >   void i915_hwmon_unregister(struct drm_i915_private *i915);
> > > > > > +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> > > > > > +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> > > > > >   #else
> > > > > >   static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> > > > > >   static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> > > > > > +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> > > > > > +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> > > > > >   #endif
> > > > > >     #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-28  9:14           ` Tvrtko Ursulin
@ 2023-04-06  4:52             ` Dixit, Ashutosh
  0 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-04-06  4:52 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx, dri-devel, Rodrigo Vivi

On Tue, 28 Mar 2023 02:14:42 -0700, Tvrtko Ursulin wrote:
>

Hi Tvrtko,

> On 27/03/2023 18:47, Rodrigo Vivi wrote:
> >
> > +Daniel
> >
> > On Mon, Mar 27, 2023 at 09:58:52AM -0700, Dixit, Ashutosh wrote:
> >> On Sun, 26 Mar 2023 04:52:59 -0700, Rodrigo Vivi wrote:
> >>>
> >>
> >> Hi Rodrigo,
> >>
> >>> On Fri, Mar 24, 2023 at 04:31:22PM -0700, Dixit, Ashutosh wrote:
> >>>> On Fri, 24 Mar 2023 11:15:02 -0700, Belgaumkar, Vinay wrote:
> >>>>>
> >>>>
> >>>> Hi Vinay,
> >>>>
> >>>> Thanks for the review. Comments inline below.
> >>>>
> >>>>> On 3/15/2023 8:59 PM, Ashutosh Dixit wrote:
> >>>>>> On dGfx, the PL1 power limit being enabled and set to a low value results
> >>>>>> in a low GPU operating freq. It also negates the freq raise operation which
> >>>>>> is done before GuC firmware load. As a result GuC firmware load can time
> >>>>>> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> >>>>>> limit was enabled and set to a low value). Therefore disable the PL1 power
> >>>>>> limit when allowed by HW when loading GuC firmware.
> >>>>> v3 label missing in subject.
> >>>>>>
> >>>>>> v2:
> >>>>>>    - Take mutex (to disallow writes to power1_max) across GuC reset/fw load
> >>>>>>    - Add hwm_power_max_restore to error return code path
> >>>>>>
> >>>>>> v3 (Jani N):
> >>>>>>    - Add/remove explanatory comments
> >>>>>>    - Function renames
> >>>>>>    - Type corrections
> >>>>>>    - Locking annotation
> >>>>>>
> >>>>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
> >>>>>> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> >>>>>> ---
> >>>>>>    drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 +++++++
> >>>>>>    drivers/gpu/drm/i915/i915_hwmon.c     | 39 +++++++++++++++++++++++++++
> >>>>>>    drivers/gpu/drm/i915/i915_hwmon.h     |  7 +++++
> >>>>>>    3 files changed, 55 insertions(+)
> >>>>>>
> >>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>>>>> index 4ccb4be4c9cba..aa8e35a5636a0 100644
> >>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >>>>>> @@ -18,6 +18,7 @@
> >>>>>>    #include "intel_uc.h"
> >>>>>>      #include "i915_drv.h"
> >>>>>> +#include "i915_hwmon.h"
> >>>>>>      static const struct intel_uc_ops uc_ops_off;
> >>>>>>    static const struct intel_uc_ops uc_ops_on;
> >>>>>> @@ -461,6 +462,7 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>>>>	struct intel_guc *guc = &uc->guc;
> >>>>>>	struct intel_huc *huc = &uc->huc;
> >>>>>>	int ret, attempts;
> >>>>>> +	bool pl1en;
> >>>>>
> >>>>> Init to 'false' here
> >>>>
> >>>> See next comment.
> >>>>
> >>>>>
> >>>>>
> >>>>>>		GEM_BUG_ON(!intel_uc_supports_guc(uc));
> >>>>>>	GEM_BUG_ON(!intel_uc_wants_guc(uc));
> >>>>>> @@ -491,6 +493,9 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>>>>	else
> >>>>>>		attempts = 1;
> >>>>>>    +	/* Disable a potentially low PL1 power limit to allow freq to be
> >>>>>> raised */
> >>>>>> +	i915_hwmon_power_max_disable(gt->i915, &pl1en);
> >>>>>> +
> >>>>>>	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
> >>>>>>		while (attempts--) {
> >>>>>> @@ -547,6 +552,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>>>>		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >>>>>>	}
> >>>>>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> >>>>>> +
> >>>>>>	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
> >>>>>>	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
> >>>>>>    @@ -563,6 +570,8 @@ static int __uc_init_hw(struct intel_uc *uc)
> >>>>>>	/* Return GT back to RPn */
> >>>>>>	intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
> >>>>>>    +	i915_hwmon_power_max_restore(gt->i915, pl1en);
> >>>>>
> >>>>> if (pl1en)
> >>>>>
> >>>>>      i915_hwmon_power_max_enable().
> >>>>
> >>>> IMO it's better not to have checks in the main __uc_init_hw() function (if
> >>>> we do this we'll need to add 2 checks in __uc_init_hw()). If you really
> >>>> want we could do something like this inside
> >>>> i915_hwmon_power_max_disable/i915_hwmon_power_max_restore. But for now I
> >>>> am not making any changes.
> >>>>
> >>>> (I can send a patch with the changes if you want to take a look but IMO it
> >>>> will add more logic/code but without real benefits (it will save a rmw if
> >>>> the limit was already disabled, but IMO this code is called so infrequently
> >>>> (only during GuC resets) as to not have any significant impact)).
> >>>>
> >>>>>
> >>>>>> +
> >>>>>>	__uc_sanitize(uc);
> >>>>>>		if (!ret) {
> >>>>>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> >>>>>> index ee63a8fd88fc1..769b5bda4d53f 100644
> >>>>>> --- a/drivers/gpu/drm/i915/i915_hwmon.c
> >>>>>> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> >>>>>> @@ -444,6 +444,45 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
> >>>>>>	}
> >>>>>>    }
> >>>>>>    +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool
> >>>>>> *old)
> >>>>> Shouldn't we call this i915_hwmon_package_pl1_disable()?
> >>>>
> >>>> I did think of using "pl1" in the function name but then decided to retain
> >>>> "power_max" because other hwmon functions for PL1 limit also use
> >>>> "power_max" (hwm_power_max_read/hwm_power_max_write) and currently
> >>>> "hwmon_power_max" is mapped to the PL1 limit. So "power_max" is used to
> >>>> show that all these functions deal with the PL1 power limit.
> >>>>
> >>>> There is a comment in __uc_init_hw() explaining "power_max" means the PL1
> >>>> power limit.
> >>>>
> >>>>>> +	__acquires(i915->hwmon->hwmon_lock)
> >>>>>> +{
> >>>>>> +	struct i915_hwmon *hwmon = i915->hwmon;
> >>>>>> +	intel_wakeref_t wakeref;
> >>>>>> +	u32 r;
> >>>>>> +
> >>>>>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> >>>>>> +		return;
> >>>>>> +
> >>>>>> +	/* Take mutex to prevent concurrent hwm_power_max_write */
> >>>>>> +	mutex_lock(&hwmon->hwmon_lock);
> >>>>>> +
> >>>>>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> >>>>>> +		r = intel_uncore_rmw(hwmon->ddat.uncore,
> >>>>>> +				     hwmon->rg.pkg_rapl_limit,
> >>>>>> +				     PKG_PWR_LIM_1_EN, 0);
> >>>>> Most of this code (lock and rmw parts) is already inside static void
> >>>>> hwm_locked_with_pm_intel_uncore_rmw() , can we reuse that here?
> >>>>
> >>>> This was the case in v1 of the patch:
> >>>>
> >>>> https://patchwork.freedesktop.org/patch/526393/?series=115003&rev=1
> >>>>
> >>>> But now this cannot be done because if you notice we acquire the mutex in
> >>>> i915_hwmon_power_max_disable() and release the mutex in
> >>>> i915_hwmon_power_max_restore().
> >>>>
> >>>> I explained the reason why the mutex is handled this way in my reply
> >>>> to Jani Nikula here:
> >>>>
> >>>> https://patchwork.freedesktop.org/patch/526598/?series=115003&rev=2
> >>>>
> >>>> Quoting below:
> >>>>
> >>>> ```
> >>>>>> +	/* hwmon_lock mutex is unlocked in hwm_power_max_restore */
> >>>>>
> >>>>> Not too happy about that... any better ideas?
> >>>>
> >>>> Afais, taking the mutex is the only fully correct solution (when we disable
> >>>> the power limit, userspace can go re-enable it). Examples of partly
> >>>> incorrect solutions (which don't take the mutex) include:
> >>>>
> >>>> a. Don't take the mutex, don't do anything, ignore any changes to the value
> >>>>     if it has changed during GuC reset/fw load (just overwrite the changed
> >>>>     value). Con: changed value is lost.
> >>>>
> >>>> b. Detect if the value has changed (the limit has been re-enabled) after we
> >>>>     have disabled the limit and in that case skip restoring the value. But
> >>>>     then someone can say why do we allow enabling the PL1 limit since we
> >>>>     want to disable it.
> >>>>
> >>>> Both these are very unlikely scenarios so they might work. But I would
> >>>> first like to explore if holding a mutex across GuC reset is problematic
> >>>> since that is /the/ correct solution. But if anyone comes up with a reason
> >>>> why that cannot be done we can look at these other not completely correct
> >>>> options.
> >>>
> >>> I see what you are doing and it looks indeed a very safe approach to ensure
> >>> the pl1 won't be toggled by other paths while we need some guaranteed state
> >>> here, or hw init fails badly.
> >>>
> >>> But in the end you are making your lock to protect the code from another path
> >>> and not protecting the data itself. The data was already protected in the
> >>> first version with the lock in the rmw.
> >>
> >> Sorry I am not really following. Daniel had mentioned this "protecting code
> >> vs protecting data" but I am wondering how it is applicable in this
> >> case. IMO here the data we are protecting is the register which we don't
> >> want written to by userland while GuC load is in progress. To do that we
> >> need to block the code path writing to register. So what we have here seems
> >> to me to be the simplest and cleanest approach for solving this issue.
> >
> > I believe your case here is exactly what Daniel had mentioned as protecting
> > code and not data. Well, in the end we are of course protecting data to be
> > modified, but in your case you use that mutex to also protect the code path
> > and avoid other calls while you are in this guc_init_path...
> >
> > Please Daniel, correct me here if I got it wrong.
> >
> > What I don't like here is that we lock in one function, hold the lock for a
> > while, and unlock in the other function. To protect the data itself, in general
> > we just need to hold the lock for a very minimal time while we are modifying the data.
> >
> >>
> >>> maybe we need to have some kind of a state check with a separate state-lock, and
> >>> then if we are in this forced state for the init path, the request from the normal path
> >>> is ignored and we move on,
> >>
> >> I don't see how this will *not* be racy...
> >
> > maybe something like this?:
> >
> > at power_max_disable:
> > mutex_lock(data_lock);
> >
> > mutex_lock(state_lock);
> > state = in_use;
> > mutex_unlock(state_lock);
> >
> > mmio_rmw();
> > mutex_unlock(data_lock);
> >
> >
> > at power_max_restoration:
> >
> > at power_max_disable:
> > mutex_lock(data_lock);
> >
> > mutex_lock(state_lock);
> > state = available;
> > mutex_unlock(state_lock);
> >
> > mmio_rmw();
> > mutex_unlock(data_lock);
> >
> > at sysfs fn:
> >
> > mutex_lock(data_lock);
> > mutex_lock(state_lock);
> > if (state == in_use) {
> >     ret = -EAGAIN
> >     goto out;
> > }
> > mutex_unlock(state_lock);
> >
> > ....
> >
> > out:
> >
> > mutex_unlock(data_lock);
>
> I agree holding the mutex across functions to cover the GuC init path is
> not the nicest pattern. Above looks a plausible improvement,

Agree, the above turned out rather nice.

> although I don't know if EAGAIN is correct for hwmon, or if blocking is.

Yes, blocking is the correct uapi behavior; patch 3/3 in the latest version
does this:

https://patchwork.freedesktop.org/series/116172/

> Is something expected to be configuring those fields during boot and can
> it even handle EAGAIN?
>
> One advantage of the solution from this patch I can see though is that I
> think it eliminates data races (restoring the stale value) with fw reload
> triggered by a potential full GPU reset happening in parallel to sysfs
> writes.
>
> Another thing to check would be if the inversions between
> hwmon_lock->rpm_get and rpm_get->hwmon_lock are okay.
>
> In fact, I am not sure rpm_get in this patch is needed? Seems to be running
> under paths which guarantee holding it already, if I am not missing
> something. If not needed then there is obviously no inversion in any way.

Yes, I checked and believe you are right. So I have eliminated rpm_get in
the disable/restore functions in the latest version.

Thanks.
--
Ashutosh

> P.S.
> Do some of the existing mutex_lock calls actually need to be mutex_lock_interruptible
> so sysfs reads/writes can be interrupted with Ctrl-C, in theory at least?
>
>
> >>> or maybe we queue some request...
> >>
> >> Queuing a request will not be enough (even if this is possible), the
> >> request will need to wait to complete till GuC load completes. So we'll
> >> have to complete the request when GuC load completes, similar to releasing
> >> the mutex in the current patch. Looks like a much more complicated way of
> >> doing what the mutex does very simply.
> >
> > The wq would sleep/delay while state == in_use, then process the next request...
> >
> >>
> >> So:
> >>
> >> a. What is the real problem with the current implementation?
> >
> > probably the big lock used to protect the state machinery...
> >
> > but if other folks believe that we don't have an actual problem here
> > and this big lock is acceptable as long as it has the annotation for
> > the static analyzers, I'm okay to just let it go...
> >
> >
> >>
> >> b. What would be the correct solution for it? That is how, specifically,
> >>     should we implement it?
> >
> > state handling with separated lock from the data itself is my suggestion.
> >
> >>
> >> Some more guidance will be helpful if you think this patch has issues.
> >
> > I hope Daniel and/or other i915 maintainers can jump in here. Especially if
> > I'm being too paranoid and the current patch is enough...
> >
> >>
> >> Thanks.
> >> --
> >> Ashutosh
> >>
> >>>> ```
> >>>>
> >>>>>> +
> >>>>>> +	*old = !!(r & PKG_PWR_LIM_1_EN);
> >>>>>> +}
> >>>>>> +
> >>>>>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> >>>>>> +	__releases(i915->hwmon->hwmon_lock)
> >>>>> We can just call this i915_hwmon_power_max_enable() and call whenever the
> >>>>> old value was actually enabled. That way, we have proper mirror functions.
> >>>>
> >>>> As I explained that would mean adding two checks in the main __uc_init_hw()
> >>>> function which I am trying to avoid. So we have disable/restore pair.
> >>>>
> >>>>>> +{
> >>>>>> +	struct i915_hwmon *hwmon = i915->hwmon;
> >>>>>> +	intel_wakeref_t wakeref;
> >>>>>> +
> >>>>>> +	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
> >>>>>> +		return;
> >>>>>> +
> >>>>>> +	with_intel_runtime_pm(hwmon->ddat.uncore->rpm, wakeref)
> >>>>>> +		intel_uncore_rmw(hwmon->ddat.uncore,
> >>>>>> +				 hwmon->rg.pkg_rapl_limit,
> >>>>>> +				 PKG_PWR_LIM_1_EN,
> >>>>>> +				 old ? PKG_PWR_LIM_1_EN : 0);
> >>>>>
> >>>>> 3rd param should be 0 here, else we will end up clearing other bits.
> >>>>
> >>>> No see intel_uncore_rmw(), it will only clear the PKG_PWR_LIM_1_EN bit, so
> >>>> the code here is correct. intel_uncore_rmw() does:
> >>>>
> >>>>          val = (old & ~clear) | set;
> >>>>
> >>>> So for now I am not making any changes, if you feel strongly about
> >>>> something one way or another let me know. Anyway these comments should help
> >>>> you understand the patch better so take a look and we can go from there.
> >>>>
> >>>> Thanks.
> >>>> --
> >>>> Ashutosh
> >>>>
> >>>>>> +
> >>>>>> +	mutex_unlock(&hwmon->hwmon_lock);
> >>>>>> +}
> >>>>>> +
> >>>>>>    static umode_t
> >>>>>>    hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
> >>>>>>    {
> >>>>>> diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
> >>>>>> index 7ca9cf2c34c96..0fcb7de844061 100644
> >>>>>> --- a/drivers/gpu/drm/i915/i915_hwmon.h
> >>>>>> +++ b/drivers/gpu/drm/i915/i915_hwmon.h
> >>>>>> @@ -7,14 +7,21 @@
> >>>>>>    #ifndef __I915_HWMON_H__
> >>>>>>    #define __I915_HWMON_H__
> >>>>>>    +#include <linux/types.h>
> >>>>>> +
> >>>>>>    struct drm_i915_private;
> >>>>>> +struct intel_gt;
> >>>>>>      #if IS_REACHABLE(CONFIG_HWMON)
> >>>>>>    void i915_hwmon_register(struct drm_i915_private *i915);
> >>>>>>    void i915_hwmon_unregister(struct drm_i915_private *i915);
> >>>>>> +void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old);
> >>>>>> +void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old);
> >>>>>>    #else
> >>>>>>    static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
> >>>>>>    static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
> >>>>>> +static inline void i915_hwmon_power_max_disable(struct drm_i915_private *i915, bool *old) { };
> >>>>>> +static inline void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old) { };
> >>>>>>    #endif
> >>>>>>      #endif /* __I915_HWMON_H__ */

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
  2023-03-11  0:33 Ashutosh Dixit
@ 2023-03-11 20:46 ` Dixit, Ashutosh
  0 siblings, 0 replies; 25+ messages in thread
From: Dixit, Ashutosh @ 2023-03-11 20:46 UTC (permalink / raw)
  To: intel-gfx
  Cc: Vinay Belgaumkar, dri-devel, Badal Nilawar, John Harrison, Rodrigo Vivi

On Fri, 10 Mar 2023 16:33:58 -0800, Ashutosh Dixit wrote:
>
> On dGfx, the PL1 power limit being enabled and set to a low value results
> in a low GPU operating freq. It also negates the freq raise operation which
> is done before GuC firmware load. As a result GuC firmware load can time
> out. Such timeouts were seen in the GL #8062 bug below (where the PL1 power
> limit was enabled and set to a low value). Therefore disable the PL1 power
> limit when possible when loading GuC firmware.

There are a couple of bugs in the patch. Please don't review it yet; I will
post a v2. Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware
@ 2023-03-11  0:33 Ashutosh Dixit
  2023-03-11 20:46 ` Dixit, Ashutosh
  0 siblings, 1 reply; 25+ messages in thread
From: Ashutosh Dixit @ 2023-03-11  0:33 UTC (permalink / raw)
  To: intel-gfx
  Cc: Vinay Belgaumkar, John Harrison, Badal Nilawar, dri-devel, Rodrigo Vivi

On dGfx, when the PL1 power limit is enabled and set to a low value, the GPU
operates at a low frequency. This also negates the frequency raise that is
done before GuC firmware load. As a result, GuC firmware load can time out.
Such timeouts were seen in GitLab issue #8062 linked below (where the PL1
power limit was enabled and set to a low value). Therefore, when loading GuC
firmware, disable the PL1 power limit whenever it is possible to do so.

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8062
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_uc.c |  9 ++++++-
 drivers/gpu/drm/i915/i915_hwmon.c     | 34 +++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_hwmon.h     |  7 ++++++
 3 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 1b7ecd384a79..8794d54500d7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -18,6 +18,7 @@
 #include "intel_uc.h"
 
 #include "i915_drv.h"
+#include "i915_hwmon.h"
 
 static const struct intel_uc_ops uc_ops_off;
 static const struct intel_uc_ops uc_ops_on;
@@ -460,7 +461,7 @@ static int __uc_init_hw(struct intel_uc *uc)
 	struct drm_i915_private *i915 = gt->i915;
 	struct intel_guc *guc = &uc->guc;
 	struct intel_huc *huc = &uc->huc;
-	int ret, attempts;
+	int ret, attempts, pl1en;
 
 	GEM_BUG_ON(!intel_uc_supports_guc(uc));
 	GEM_BUG_ON(!intel_uc_wants_guc(uc));
@@ -491,6 +492,9 @@ static int __uc_init_hw(struct intel_uc *uc)
 	else
 		attempts = 1;
 
+	/* Disable PL1 limit before raising freq when possible */
+	hwm_power_max_disable(gt, &pl1en);
+
 	intel_rps_raise_unslice(&uc_to_gt(uc)->rps);
 
 	while (attempts--) {
@@ -544,6 +548,9 @@ static int __uc_init_hw(struct intel_uc *uc)
 		intel_rps_lower_unslice(&uc_to_gt(uc)->rps);
 	}
 
+	/* Restore PL1 limit */
+	hwm_power_max_restore(gt, pl1en);
+
 	guc_info(guc, "submission %s\n", str_enabled_disabled(intel_uc_uses_guc_submission(uc)));
 	guc_info(guc, "SLPC %s\n", str_enabled_disabled(intel_uc_uses_guc_slpc(uc)));
 
diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
index ee63a8fd88fc..4ce3da7b7adc 100644
--- a/drivers/gpu/drm/i915/i915_hwmon.c
+++ b/drivers/gpu/drm/i915/i915_hwmon.c
@@ -62,20 +62,23 @@ struct i915_hwmon {
 	int scl_shift_time;
 };
 
-static void
+static u32
 hwm_locked_with_pm_intel_uncore_rmw(struct hwm_drvdata *ddat,
 				    i915_reg_t reg, u32 clear, u32 set)
 {
 	struct i915_hwmon *hwmon = ddat->hwmon;
 	struct intel_uncore *uncore = ddat->uncore;
 	intel_wakeref_t wakeref;
+	u32 old;
 
 	mutex_lock(&hwmon->hwmon_lock);
 
 	with_intel_runtime_pm(uncore->rpm, wakeref)
-		intel_uncore_rmw(uncore, reg, clear, set);
+		old = intel_uncore_rmw(uncore, reg, clear, set);
 
 	mutex_unlock(&hwmon->hwmon_lock);
+
+	return old;
 }
 
 /*
@@ -444,6 +447,33 @@ hwm_power_write(struct hwm_drvdata *ddat, u32 attr, int chan, long val)
 	}
 }
 
+void hwm_power_max_disable(struct intel_gt *gt, u32 *old)
+{
+	struct i915_hwmon *hwmon = gt->i915->hwmon;
+	u32 r;
+
+	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
+		return;
+
+	r = hwm_locked_with_pm_intel_uncore_rmw(&hwmon->ddat,
+						hwmon->rg.pkg_rapl_limit,
+						PKG_PWR_LIM_1_EN, 0);
+	*old = !!(r & PKG_PWR_LIM_1_EN);
+}
+
+void hwm_power_max_restore(struct intel_gt *gt, u32 old)
+{
+	struct i915_hwmon *hwmon = gt->i915->hwmon;
+
+	if (!hwmon || !i915_mmio_reg_valid(hwmon->rg.pkg_rapl_limit))
+		return;
+
+	hwm_locked_with_pm_intel_uncore_rmw(&hwmon->ddat,
+					    hwmon->rg.pkg_rapl_limit,
+					    PKG_PWR_LIM_1_EN,
+					    old ? PKG_PWR_LIM_1_EN : 0);
+}
+
 static umode_t
 hwm_energy_is_visible(const struct hwm_drvdata *ddat, u32 attr)
 {
diff --git a/drivers/gpu/drm/i915/i915_hwmon.h b/drivers/gpu/drm/i915/i915_hwmon.h
index 7ca9cf2c34c9..0c2db11be2e2 100644
--- a/drivers/gpu/drm/i915/i915_hwmon.h
+++ b/drivers/gpu/drm/i915/i915_hwmon.h
@@ -7,14 +7,21 @@
 #ifndef __I915_HWMON_H__
 #define __I915_HWMON_H__
 
+#include <linux/types.h>
+
 struct drm_i915_private;
+struct intel_gt;
 
 #if IS_REACHABLE(CONFIG_HWMON)
 void i915_hwmon_register(struct drm_i915_private *i915);
 void i915_hwmon_unregister(struct drm_i915_private *i915);
+void hwm_power_max_disable(struct intel_gt *gt, u32 *old);
+void hwm_power_max_restore(struct intel_gt *gt, u32 old);
 #else
 static inline void i915_hwmon_register(struct drm_i915_private *i915) { };
 static inline void i915_hwmon_unregister(struct drm_i915_private *i915) { };
+void hwm_power_max_disable(struct intel_gt *gt, u32 *old) { };
+void hwm_power_max_restore(struct intel_gt *gt, u32 old) { };
 #endif
 
 #endif /* __I915_HWMON_H__ */
-- 
2.38.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2023-04-06  4:52 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-16  3:59 [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware Ashutosh Dixit
2023-03-16  3:59 ` [Intel-gfx] " Ashutosh Dixit
2023-03-16  4:13 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for drm/i915/guc: Disable PL1 power limit when loading GuC firmware (rev3) Patchwork
2023-03-16  4:37 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2023-03-16  9:05 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
2023-03-24 18:15 ` [PATCH] drm/i915/guc: Disable PL1 power limit when loading GuC firmware Belgaumkar, Vinay
2023-03-24 18:15   ` [Intel-gfx] " Belgaumkar, Vinay
2023-03-24 23:31   ` Dixit, Ashutosh
2023-03-24 23:31     ` [Intel-gfx] " Dixit, Ashutosh
2023-03-25  0:06     ` Belgaumkar, Vinay
2023-03-25  0:06       ` [Intel-gfx] " Belgaumkar, Vinay
2023-03-27 16:57       ` Dixit, Ashutosh
2023-03-27 16:57         ` [Intel-gfx] " Dixit, Ashutosh
2023-03-26 11:52     ` Rodrigo Vivi
2023-03-26 11:52       ` [Intel-gfx] " Rodrigo Vivi
2023-03-27 16:58       ` Dixit, Ashutosh
2023-03-27 16:58         ` [Intel-gfx] " Dixit, Ashutosh
2023-03-27 17:47         ` Rodrigo Vivi
2023-03-27 17:47           ` Rodrigo Vivi
2023-03-28  9:14           ` Tvrtko Ursulin
2023-04-06  4:52             ` Dixit, Ashutosh
2023-04-06  4:50           ` Dixit, Ashutosh
2023-04-06  4:50             ` Dixit, Ashutosh
  -- strict thread matches above, loose matches on Subject: below --
2023-03-11  0:33 Ashutosh Dixit
2023-03-11 20:46 ` Dixit, Ashutosh

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.