All of lore.kernel.org
 help / color / mirror / Atom feed
* [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
@ 2022-06-16 22:13 Nerlige Ramappa, Umesh
  2022-06-17  1:31 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for i915/pmu: Wire GuC backend to per-client busyness (rev3) Patchwork
                   ` (3 more replies)
  0 siblings, 4 replies; 29+ messages in thread
From: Nerlige Ramappa, Umesh @ 2022-06-16 22:13 UTC (permalink / raw)
  To: intel-gfx

From: John Harrison <John.C.Harrison@Intel.com>

GuC provides engine_id and last_switch_in ticks for an active context in
the pphwsp. The context image provides a 32 bit total ticks which is the
accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
information is used to calculate the context busyness as follows:

If the engine_id is valid, then busyness is the sum of accumulated total
ticks and active ticks. Active ticks is calculated with current gt time
as reference.

If engine_id is invalid, busyness is equal to accumulated total ticks.

Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
potential race was highlighted in an earlier review that can lead to
double accounting of busyness. While the solution to this is a wip,
busyness is still usable for platforms running GuC submission.

v2: (Tvrtko)
- Use COPS_RUNTIME_ACTIVE_TOTAL
- Add code comment for the race
- Undo local variables initializations

v3:
- Add support for virtual engines based on
  https://patchwork.freedesktop.org/series/105227/

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
 drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 ++++++++++++++++++-
 drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
 6 files changed, 89 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 4070cb5711d8..4a84146710e0 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
 	child->parallel.parent = parent;
 }
 
-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
 {
 	u64 total, active;
 
+	if (ce->ops->update_stats)
+		ce->ops->update_stats(ce);
+
 	total = ce->stats.runtime.total;
 	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
 		total *= ce->engine->gt->clock_period_ns;
 
 	active = READ_ONCE(ce->stats.active);
-	if (active)
+	/*
+	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
+	 * already provides the total active time of the context, so skip this
+	 * calculation when this flag is set.
+	 */
+	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
 		active = intel_context_clock() - active;
 
 	return total + active;
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index b7d3214d2cdd..5fc7c19ab29b 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
 	return !!ce->parallel.number_children;
 }
 
-static inline bool intel_context_is_pinned(struct intel_context *ce);
+static inline bool intel_context_is_pinned(const struct intel_context *ce);
 
 static inline struct intel_context *
 intel_context_to_parent(struct intel_context *ce)
@@ -116,7 +116,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
  * Returns: true if the context is currently pinned for use by the GPU.
  */
 static inline bool
-intel_context_is_pinned(struct intel_context *ce)
+intel_context_is_pinned(const struct intel_context *ce)
 {
 	return atomic_read(&ce->pin_count);
 }
@@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
 	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
 }
 
-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
 u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
 
 static inline u64 intel_context_clock(void)
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 09f82545789f..797bb4242c18 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -38,6 +38,9 @@ struct intel_context_ops {
 #define COPS_RUNTIME_CYCLES_BIT 1
 #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
 
+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
+
 	int (*alloc)(struct intel_context *ce);
 
 	void (*ban)(struct intel_context *ce, struct i915_request *rq);
@@ -55,6 +58,8 @@ struct intel_context_ops {
 
 	void (*sched_disable)(struct intel_context *ce);
 
+	void (*update_stats)(struct intel_context *ce);
+
 	void (*reset)(struct intel_context *ce);
 	void (*destroy)(struct kref *kref);
 
@@ -146,6 +151,7 @@ struct intel_context {
 			struct ewma_runtime avg;
 			u64 total;
 			u32 last;
+			u64 start_gt_clk;
 			I915_SELFTEST_DECLARE(u32 num_underflow);
 			I915_SELFTEST_DECLARE(u32 max_underflow);
 		} runtime;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index b3c9a9327f76..6231ad03e4eb 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
 	return guc_class_engine_class_map[guc_class];
 }
 
+/* Per context engine usage stats: */
+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
+
 /* Work item for submitting workloads into work queue of GuC. */
 struct guc_wq_item {
 	u32 header;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 5a1dfacf24ea..cbf3cbb983ce 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
 	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
 }
 
-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
+static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
 {
 	return &ce->engine->gt->uc.guc;
 }
@@ -1323,13 +1323,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
 	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
 }
 
+static void __guc_context_update_clks(struct intel_context *ce);
 static void guc_timestamp_ping(struct work_struct *wrk)
 {
 	struct intel_guc *guc = container_of(wrk, typeof(*guc),
 					     timestamp.work.work);
 	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
 	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_context *ce;
 	intel_wakeref_t wakeref;
+	unsigned long index;
 	int srcu, ret;
 
 	/*
@@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
 		__update_guc_busyness_stats(guc);
 
+	/* adjust context stats for overflow */
+	xa_for_each(&guc->context_lookup, index, ce)
+		__guc_context_update_clks(ce);
+
 	intel_gt_reset_unlock(gt, srcu);
 
 	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
@@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
 			 guc->timestamp.ping_delay);
 }
 
+static void __guc_context_update_clks(struct intel_context *ce)
+{
+	struct intel_guc *guc = ce_to_guc(ce);
+	struct intel_gt *gt = ce->engine->gt;
+	u32 *pphwsp, last_switch, engine_id;
+	u64 start_gt_clk, active;
+	unsigned long flags;
+	ktime_t unused;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	/*
+	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
+	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
+	 * relies on GuC and GPU for busyness calculations. Due to this, A
+	 * potential race was highlighted in an earlier review that can lead to
+	 * double accounting of busyness. While the solution to this is a wip,
+	 * busyness is still usable for platforms running GuC submission.
+	 */
+	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
+	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
+	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
+
+	guc_update_pm_timestamp(guc, &unused);
+
+	if (engine_id != 0xffffffff && last_switch) {
+		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
+		__extend_last_switch(guc, &start_gt_clk, last_switch);
+		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
+		WRITE_ONCE(ce->stats.active, active);
+	} else {
+		lrc_update_runtime(ce);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void guc_context_update_stats(struct intel_context *ce)
+{
+	if (!intel_context_pin_if_active(ce)) {
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
+		WRITE_ONCE(ce->stats.active, 0);
+		return;
+	}
+
+	__guc_context_update_clks(ce);
+	intel_context_unpin(ce);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct intel_context *ce)
 {
 	struct intel_guc *guc = ce_to_guc(ce);
 
+	lrc_update_runtime(ce);
 	unpin_guc_id(guc, ce);
 	lrc_unpin(ce);
 
@@ -3183,6 +3241,7 @@ static void remove_from_context(struct i915_request *rq)
 }
 
 static const struct intel_context_ops guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
 	.alloc = guc_context_alloc,
 
 	.pre_pin = guc_context_pre_pin,
@@ -3199,6 +3258,8 @@ static const struct intel_context_ops guc_context_ops = {
 
 	.sched_disable = guc_context_sched_disable,
 
+	.update_stats = guc_context_update_stats,
+
 	.reset = lrc_reset,
 	.destroy = guc_context_destroy,
 
@@ -3432,6 +3493,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
 }
 
 static const struct intel_context_ops virtual_guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
 	.alloc = guc_virtual_context_alloc,
 
 	.pre_pin = guc_virtual_context_pre_pin,
@@ -3447,6 +3509,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
 	.exit = guc_virtual_context_exit,
 
 	.sched_disable = guc_context_sched_disable,
+	.update_stats = guc_context_update_stats,
 
 	.destroy = guc_context_destroy,
 
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
index 18d38cb59923..118db6f03f15 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -146,11 +146,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
 		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
 	seq_printf(m, "drm-client-id:\t%u\n", client->id);
 
-	/*
-	 * Temporarily skip showing client engine information with GuC submission till
-	 * fetching engine busyness is implemented in the GuC submission backend
-	 */
-	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
+	if (GRAPHICS_VER(i915) < 8)
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [Intel-gfx] ✗ Fi.CI.SPARSE: warning for i915/pmu: Wire GuC backend to per-client busyness (rev3)
  2022-06-16 22:13 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Nerlige Ramappa, Umesh
@ 2022-06-17  1:31 ` Patchwork
  2022-06-17  1:51 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 29+ messages in thread
From: Patchwork @ 2022-06-17  1:31 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

== Series Details ==

Series: i915/pmu: Wire GuC backend to per-client busyness (rev3)
URL   : https://patchwork.freedesktop.org/series/105085/
State : warning

== Summary ==

Error: dim sparse failed
Sparse version: v0.6.2
Fast mode used, each commit won't be checked separately.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for i915/pmu: Wire GuC backend to per-client busyness (rev3)
  2022-06-16 22:13 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Nerlige Ramappa, Umesh
  2022-06-17  1:31 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for i915/pmu: Wire GuC backend to per-client busyness (rev3) Patchwork
@ 2022-06-17  1:51 ` Patchwork
  2022-06-17  8:00 ` [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Tvrtko Ursulin
  2022-06-17 12:14 ` [Intel-gfx] ✗ Fi.CI.IGT: failure for i915/pmu: Wire GuC backend to per-client busyness (rev3) Patchwork
  3 siblings, 0 replies; 29+ messages in thread
From: Patchwork @ 2022-06-17  1:51 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 3143 bytes --]

== Series Details ==

Series: i915/pmu: Wire GuC backend to per-client busyness (rev3)
URL   : https://patchwork.freedesktop.org/series/105085/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_11773 -> Patchwork_105085v3
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/index.html

Participating hosts (35 -> 33)
------------------------------

  Missing    (2): fi-rkl-11600 fi-bdw-samus 

Known issues
------------

  Here are the changes found in Patchwork_105085v3 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live@gem:
    - fi-pnv-d510:        NOTRUN -> [DMESG-FAIL][1] ([i915#4528])
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/fi-pnv-d510/igt@i915_selftest@live@gem.html

  * igt@kms_flip@basic-flip-vs-wf_vblank@a-edp1:
    - fi-tgl-u2:          [PASS][2] -> [DMESG-WARN][3] ([i915#402])
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/fi-tgl-u2/igt@kms_flip@basic-flip-vs-wf_vblank@a-edp1.html
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/fi-tgl-u2/igt@kms_flip@basic-flip-vs-wf_vblank@a-edp1.html

  
#### Possible fixes ####

  * igt@i915_selftest@live@gt_lrc:
    - fi-bsw-n3050:       [DMESG-FAIL][4] ([i915#2373]) -> [PASS][5]
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/fi-bsw-n3050/igt@i915_selftest@live@gt_lrc.html
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/fi-bsw-n3050/igt@i915_selftest@live@gt_lrc.html

  * igt@i915_selftest@live@requests:
    - fi-pnv-d510:        [DMESG-FAIL][6] ([i915#4528]) -> [PASS][7]
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/fi-pnv-d510/igt@i915_selftest@live@requests.html
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/fi-pnv-d510/igt@i915_selftest@live@requests.html

  * igt@kms_flip@basic-flip-vs-modeset@a-edp1:
    - fi-tgl-u2:          [DMESG-WARN][8] ([i915#402]) -> [PASS][9]
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/fi-tgl-u2/igt@kms_flip@basic-flip-vs-modeset@a-edp1.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/fi-tgl-u2/igt@kms_flip@basic-flip-vs-modeset@a-edp1.html

  
  [i915#2373]: https://gitlab.freedesktop.org/drm/intel/issues/2373
  [i915#402]: https://gitlab.freedesktop.org/drm/intel/issues/402
  [i915#4528]: https://gitlab.freedesktop.org/drm/intel/issues/4528


Build changes
-------------

  * Linux: CI_DRM_11773 -> Patchwork_105085v3

  CI-20190529: 20190529
  CI_DRM_11773: 8025a295b7aa707f64c7984b7781c6f25e22a901 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_6533: 6b5107d91827962808441db6b98e478aa9e67bdb @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_105085v3: 8025a295b7aa707f64c7984b7781c6f25e22a901 @ git://anongit.freedesktop.org/gfx-ci/linux


### Linux commits

753bc34ac867 i915/pmu: Wire GuC backend to per-client busyness

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/index.html

[-- Attachment #2: Type: text/html, Size: 4004 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-06-16 22:13 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Nerlige Ramappa, Umesh
  2022-06-17  1:31 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for i915/pmu: Wire GuC backend to per-client busyness (rev3) Patchwork
  2022-06-17  1:51 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
@ 2022-06-17  8:00 ` Tvrtko Ursulin
  2022-07-27  6:01   ` Umesh Nerlige Ramappa
  2022-06-17 12:14 ` [Intel-gfx] ✗ Fi.CI.IGT: failure for i915/pmu: Wire GuC backend to per-client busyness (rev3) Patchwork
  3 siblings, 1 reply; 29+ messages in thread
From: Tvrtko Ursulin @ 2022-06-17  8:00 UTC (permalink / raw)
  To: Nerlige Ramappa, Umesh, intel-gfx


On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> GuC provides engine_id and last_switch_in ticks for an active context in
> the pphwsp. The context image provides a 32 bit total ticks which is the
> accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
> information is used to calculate the context busyness as follows:
> 
> If the engine_id is valid, then busyness is the sum of accumulated total
> ticks and active ticks. Active ticks is calculated with current gt time
> as reference.
> 
> If engine_id is invalid, busyness is equal to accumulated total ticks.
> 
> Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
> potential race was highlighted in an earlier review that can lead to
> double accounting of busyness. While the solution to this is a wip,
> busyness is still usable for platforms running GuC submission.
> 
> v2: (Tvrtko)
> - Use COPS_RUNTIME_ACTIVE_TOTAL
> - Add code comment for the race
> - Undo local variables initializations
> 
> v3:
> - Add support for virtual engines based on
>    https://patchwork.freedesktop.org/series/105227/
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>   drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>   drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 ++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>   6 files changed, 89 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index 4070cb5711d8..4a84146710e0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>   	child->parallel.parent = parent;
>   }
>   
> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>   {
>   	u64 total, active;
>   
> +	if (ce->ops->update_stats)
> +		ce->ops->update_stats(ce);
> +
>   	total = ce->stats.runtime.total;
>   	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>   		total *= ce->engine->gt->clock_period_ns;
>   
>   	active = READ_ONCE(ce->stats.active);
> -	if (active)
> +	/*
> +	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
> +	 * already provides the total active time of the context, so skip this
> +	 * calculation when this flag is set.
> +	 */
> +	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>   		active = intel_context_clock() - active;
>   
>   	return total + active;
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index b7d3214d2cdd..5fc7c19ab29b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
>   	return !!ce->parallel.number_children;
>   }
>   
> -static inline bool intel_context_is_pinned(struct intel_context *ce);
> +static inline bool intel_context_is_pinned(const struct intel_context *ce);
>   
>   static inline struct intel_context *
>   intel_context_to_parent(struct intel_context *ce)
> @@ -116,7 +116,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
>    * Returns: true if the context is currently pinned for use by the GPU.
>    */
>   static inline bool
> -intel_context_is_pinned(struct intel_context *ce)
> +intel_context_is_pinned(const struct intel_context *ce)
>   {
>   	return atomic_read(&ce->pin_count);
>   }
> @@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
>   	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>   }
>   
> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>   u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>   
>   static inline u64 intel_context_clock(void)
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 09f82545789f..797bb4242c18 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -38,6 +38,9 @@ struct intel_context_ops {
>   #define COPS_RUNTIME_CYCLES_BIT 1
>   #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>   
> +#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
> +#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
> +
>   	int (*alloc)(struct intel_context *ce);
>   
>   	void (*ban)(struct intel_context *ce, struct i915_request *rq);
> @@ -55,6 +58,8 @@ struct intel_context_ops {
>   
>   	void (*sched_disable)(struct intel_context *ce);
>   
> +	void (*update_stats)(struct intel_context *ce);
> +
>   	void (*reset)(struct intel_context *ce);
>   	void (*destroy)(struct kref *kref);
>   
> @@ -146,6 +151,7 @@ struct intel_context {
>   			struct ewma_runtime avg;
>   			u64 total;
>   			u32 last;
> +			u64 start_gt_clk;
>   			I915_SELFTEST_DECLARE(u32 num_underflow);
>   			I915_SELFTEST_DECLARE(u32 max_underflow);
>   		} runtime;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index b3c9a9327f76..6231ad03e4eb 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
>   	return guc_class_engine_class_map[guc_class];
>   }
>   
> +/* Per context engine usage stats: */
> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
> +#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
> +
>   /* Work item for submitting workloads into work queue of GuC. */
>   struct guc_wq_item {
>   	u32 header;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 5a1dfacf24ea..cbf3cbb983ce 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
>   	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>   }
>   
> -static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
> +static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
>   {
>   	return &ce->engine->gt->uc.guc;
>   }
> @@ -1323,13 +1323,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
>   	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>   }
>   
> +static void __guc_context_update_clks(struct intel_context *ce);
>   static void guc_timestamp_ping(struct work_struct *wrk)
>   {
>   	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>   					     timestamp.work.work);
>   	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>   	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_context *ce;
>   	intel_wakeref_t wakeref;
> +	unsigned long index;
>   	int srcu, ret;
>   
>   	/*
> @@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>   	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>   		__update_guc_busyness_stats(guc);
>   
> +	/* adjust context stats for overflow */
> +	xa_for_each(&guc->context_lookup, index, ce)
> +		__guc_context_update_clks(ce);
> +
>   	intel_gt_reset_unlock(gt, srcu);
>   
>   	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> @@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
>   			 guc->timestamp.ping_delay);
>   }
>   
> +static void __guc_context_update_clks(struct intel_context *ce)
> +{
> +	struct intel_guc *guc = ce_to_guc(ce);
> +	struct intel_gt *gt = ce->engine->gt;
> +	u32 *pphwsp, last_switch, engine_id;
> +	u64 start_gt_clk, active;
> +	unsigned long flags;
> +	ktime_t unused;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	/*
> +	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
> +	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
> +	 * relies on GuC and GPU for busyness calculations. Due to this, A
> +	 * potential race was highlighted in an earlier review that can lead to
> +	 * double accounting of busyness. While the solution to this is a wip,
> +	 * busyness is still usable for platforms running GuC submission.
> +	 */
> +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
> +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
> +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
> +
> +	guc_update_pm_timestamp(guc, &unused);
> +
> +	if (engine_id != 0xffffffff && last_switch) {
> +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
> +		__extend_last_switch(guc, &start_gt_clk, last_switch);
> +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
> +		WRITE_ONCE(ce->stats.active, active);
> +	} else {
> +		lrc_update_runtime(ce);
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void guc_context_update_stats(struct intel_context *ce)
> +{
> +	if (!intel_context_pin_if_active(ce)) {
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
> +		WRITE_ONCE(ce->stats.active, 0);
> +		return;
> +	}
> +
> +	__guc_context_update_clks(ce);
> +	intel_context_unpin(ce);
> +}
> +
>   static inline bool
>   submission_disabled(struct intel_guc *guc)
>   {
> @@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct intel_context *ce)
>   {
>   	struct intel_guc *guc = ce_to_guc(ce);
>   
> +	lrc_update_runtime(ce);
>   	unpin_guc_id(guc, ce);
>   	lrc_unpin(ce);
>   
> @@ -3183,6 +3241,7 @@ static void remove_from_context(struct i915_request *rq)
>   }
>   
>   static const struct intel_context_ops guc_context_ops = {
> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>   	.alloc = guc_context_alloc,
>   
>   	.pre_pin = guc_context_pre_pin,
> @@ -3199,6 +3258,8 @@ static const struct intel_context_ops guc_context_ops = {
>   
>   	.sched_disable = guc_context_sched_disable,
>   
> +	.update_stats = guc_context_update_stats,
> +
>   	.reset = lrc_reset,
>   	.destroy = guc_context_destroy,
>   
> @@ -3432,6 +3493,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
>   }
>   
>   static const struct intel_context_ops virtual_guc_context_ops = {
> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>   	.alloc = guc_virtual_context_alloc,
>   
>   	.pre_pin = guc_virtual_context_pre_pin,
> @@ -3447,6 +3509,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
>   	.exit = guc_virtual_context_exit,
>   
>   	.sched_disable = guc_context_sched_disable,
> +	.update_stats = guc_context_update_stats,

There are also virtual_parent_context_ops and virtual_child_context_ops 
- which means more test coverage is needed..

Regards,

Tvrtko

>   
>   	.destroy = guc_context_destroy,
>   
> diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
> index 18d38cb59923..118db6f03f15 100644
> --- a/drivers/gpu/drm/i915/i915_drm_client.c
> +++ b/drivers/gpu/drm/i915/i915_drm_client.c
> @@ -146,11 +146,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
>   		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>   	seq_printf(m, "drm-client-id:\t%u\n", client->id);
>   
> -	/*
> -	 * Temporarily skip showing client engine information with GuC submission till
> -	 * fetching engine busyness is implemented in the GuC submission backend
> -	 */
> -	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
> +	if (GRAPHICS_VER(i915) < 8)
>   		return;
>   
>   	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [Intel-gfx] ✗ Fi.CI.IGT: failure for i915/pmu: Wire GuC backend to per-client busyness (rev3)
  2022-06-16 22:13 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Nerlige Ramappa, Umesh
                   ` (2 preceding siblings ...)
  2022-06-17  8:00 ` [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Tvrtko Ursulin
@ 2022-06-17 12:14 ` Patchwork
  3 siblings, 0 replies; 29+ messages in thread
From: Patchwork @ 2022-06-17 12:14 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 39165 bytes --]

== Series Details ==

Series: i915/pmu: Wire GuC backend to per-client busyness (rev3)
URL   : https://patchwork.freedesktop.org/series/105085/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_11773_full -> Patchwork_105085v3_full
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_105085v3_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_105085v3_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

Participating hosts (10 -> 10)
------------------------------

  No changes in participating hosts

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_105085v3_full:

### IGT changes ###

#### Possible regressions ####

  * igt@gem_exec_whisper@basic-sync:
    - shard-glk:          [PASS][1] -> [INCOMPLETE][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk7/igt@gem_exec_whisper@basic-sync.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk3/igt@gem_exec_whisper@basic-sync.html

  * igt@kms_concurrent@pipe-b:
    - shard-glk:          NOTRUN -> [TIMEOUT][3]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk3/igt@kms_concurrent@pipe-b.html

  * igt@perf@enable-disable:
    - shard-glk:          [PASS][4] -> [TIMEOUT][5] +1 similar issue
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk7/igt@perf@enable-disable.html
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk3/igt@perf@enable-disable.html

  
#### Warnings ####

  * igt@kms_chamelium@dp-crc-single:
    - shard-glk:          [SKIP][6] ([fdo#109271] / [fdo#111827]) -> [TIMEOUT][7]
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk7/igt@kms_chamelium@dp-crc-single.html
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk3/igt@kms_chamelium@dp-crc-single.html

  * igt@kms_frontbuffer_tracking@psr-rgb565-draw-blt:
    - shard-glk:          [SKIP][8] ([fdo#109271]) -> [INCOMPLETE][9]
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk7/igt@kms_frontbuffer_tracking@psr-rgb565-draw-blt.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk3/igt@kms_frontbuffer_tracking@psr-rgb565-draw-blt.html

  
Known issues
------------

  Here are the changes found in Patchwork_105085v3_full that come from known issues:

### CI changes ###

#### Issues hit ####

  * boot:
    - shard-apl:          ([PASS][10], [PASS][11], [PASS][12], [PASS][13], [PASS][14], [PASS][15], [PASS][16], [PASS][17], [PASS][18], [PASS][19], [PASS][20], [PASS][21], [PASS][22], [PASS][23], [PASS][24], [PASS][25], [PASS][26], [PASS][27], [PASS][28], [PASS][29], [PASS][30], [PASS][31], [PASS][32], [PASS][33], [PASS][34]) -> ([PASS][35], [PASS][36], [PASS][37], [PASS][38], [PASS][39], [PASS][40], [FAIL][41], [PASS][42], [PASS][43], [PASS][44], [PASS][45], [PASS][46], [PASS][47], [PASS][48], [PASS][49], [PASS][50], [PASS][51], [PASS][52], [PASS][53], [PASS][54], [PASS][55], [PASS][56], [PASS][57], [PASS][58], [PASS][59]) ([i915#4386])
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl2/boot.html
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl6/boot.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl6/boot.html
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl4/boot.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl4/boot.html
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl1/boot.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl4/boot.html
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl3/boot.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl3/boot.html
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl3/boot.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl2/boot.html
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl2/boot.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl2/boot.html
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl1/boot.html
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl8/boot.html
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl1/boot.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl8/boot.html
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl8/boot.html
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl8/boot.html
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl8/boot.html
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl1/boot.html
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl7/boot.html
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl7/boot.html
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl7/boot.html
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl6/boot.html
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl8/boot.html
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl8/boot.html
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl8/boot.html
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl1/boot.html
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl1/boot.html
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl1/boot.html
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl1/boot.html
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl2/boot.html
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl2/boot.html
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl2/boot.html
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl3/boot.html
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl3/boot.html
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl3/boot.html
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl3/boot.html
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl4/boot.html
   [50]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl4/boot.html
   [51]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl4/boot.html
   [52]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl6/boot.html
   [53]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl6/boot.html
   [54]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl6/boot.html
   [55]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl6/boot.html
   [56]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl7/boot.html
   [57]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl7/boot.html
   [58]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl7/boot.html
   [59]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl7/boot.html

  
#### Possible fixes ####

  * boot:
    - shard-skl:          ([PASS][60], [PASS][61], [PASS][62], [PASS][63], [PASS][64], [PASS][65], [PASS][66], [PASS][67], [PASS][68], [PASS][69], [PASS][70], [FAIL][71], [PASS][72], [PASS][73], [PASS][74], [PASS][75], [PASS][76], [PASS][77], [PASS][78], [PASS][79], [PASS][80], [PASS][81]) ([i915#5032]) -> ([PASS][82], [PASS][83], [PASS][84], [PASS][85], [PASS][86], [PASS][87], [PASS][88], [PASS][89], [PASS][90], [PASS][91], [PASS][92], [PASS][93], [PASS][94], [PASS][95], [PASS][96], [PASS][97], [PASS][98], [PASS][99], [PASS][100], [PASS][101], [PASS][102], [PASS][103], [PASS][104], [PASS][105])
   [60]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl9/boot.html
   [61]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl9/boot.html
   [62]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl9/boot.html
   [63]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl7/boot.html
   [64]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl7/boot.html
   [65]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl6/boot.html
   [66]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl6/boot.html
   [67]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl6/boot.html
   [68]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl4/boot.html
   [69]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl4/boot.html
   [70]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl4/boot.html
   [71]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl3/boot.html
   [72]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl3/boot.html
   [73]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl2/boot.html
   [74]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl2/boot.html
   [75]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl1/boot.html
   [76]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl1/boot.html
   [77]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl1/boot.html
   [78]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl1/boot.html
   [79]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl10/boot.html
   [80]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl10/boot.html
   [81]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl10/boot.html
   [82]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl9/boot.html
   [83]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl9/boot.html
   [84]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl9/boot.html
   [85]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl7/boot.html
   [86]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl7/boot.html
   [87]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl7/boot.html
   [88]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl7/boot.html
   [89]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl6/boot.html
   [90]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl6/boot.html
   [91]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl4/boot.html
   [92]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl4/boot.html
   [93]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl4/boot.html
   [94]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/boot.html
   [95]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/boot.html
   [96]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/boot.html
   [97]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/boot.html
   [98]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/boot.html
   [99]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/boot.html
   [100]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl1/boot.html
   [101]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl1/boot.html
   [102]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl1/boot.html
   [103]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl10/boot.html
   [104]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl10/boot.html
   [105]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl10/boot.html

  

### IGT changes ###

#### Issues hit ####

  * igt@gem_ctx_isolation@preservation-s3@rcs0:
    - shard-skl:          [PASS][106] -> [INCOMPLETE][107] ([i915#4793])
   [106]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl4/igt@gem_ctx_isolation@preservation-s3@rcs0.html
   [107]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/igt@gem_ctx_isolation@preservation-s3@rcs0.html

  * igt@gem_eio@in-flight-10ms:
    - shard-tglb:         [PASS][108] -> [TIMEOUT][109] ([i915#3063])
   [108]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-tglb3/igt@gem_eio@in-flight-10ms.html
   [109]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-tglb3/igt@gem_eio@in-flight-10ms.html

  * igt@gem_exec_balancer@parallel-bb-first:
    - shard-iclb:         [PASS][110] -> [SKIP][111] ([i915#4525]) +1 similar issue
   [110]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb2/igt@gem_exec_balancer@parallel-bb-first.html
   [111]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb6/igt@gem_exec_balancer@parallel-bb-first.html

  * igt@gem_exec_fair@basic-none-share@rcs0:
    - shard-iclb:         [PASS][112] -> [FAIL][113] ([i915#2842])
   [112]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb8/igt@gem_exec_fair@basic-none-share@rcs0.html
   [113]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb3/igt@gem_exec_fair@basic-none-share@rcs0.html

  * igt@gem_exec_fair@basic-none@vecs0:
    - shard-kbl:          [PASS][114] -> [FAIL][115] ([i915#2842]) +2 similar issues
   [114]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-kbl3/igt@gem_exec_fair@basic-none@vecs0.html
   [115]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl6/igt@gem_exec_fair@basic-none@vecs0.html
    - shard-apl:          [PASS][116] -> [FAIL][117] ([i915#2842])
   [116]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl8/igt@gem_exec_fair@basic-none@vecs0.html
   [117]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl8/igt@gem_exec_fair@basic-none@vecs0.html

  * igt@gem_exec_fair@basic-pace@vcs1:
    - shard-iclb:         NOTRUN -> [FAIL][118] ([i915#2842])
   [118]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb4/igt@gem_exec_fair@basic-pace@vcs1.html

  * igt@gem_huc_copy@huc-copy:
    - shard-skl:          NOTRUN -> [SKIP][119] ([fdo#109271] / [i915#2190])
   [119]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/igt@gem_huc_copy@huc-copy.html

  * igt@gem_lmem_swapping@heavy-random:
    - shard-skl:          NOTRUN -> [SKIP][120] ([fdo#109271] / [i915#4613]) +1 similar issue
   [120]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/igt@gem_lmem_swapping@heavy-random.html

  * igt@gem_userptr_blits@dmabuf-sync:
    - shard-skl:          NOTRUN -> [SKIP][121] ([fdo#109271] / [i915#3323])
   [121]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/igt@gem_userptr_blits@dmabuf-sync.html

  * igt@gem_userptr_blits@input-checking:
    - shard-skl:          NOTRUN -> [DMESG-WARN][122] ([i915#4991])
   [122]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/igt@gem_userptr_blits@input-checking.html

  * igt@gen9_exec_parse@allowed-all:
    - shard-glk:          [PASS][123] -> [DMESG-WARN][124] ([i915#5566] / [i915#716])
   [123]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk3/igt@gen9_exec_parse@allowed-all.html
   [124]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk9/igt@gen9_exec_parse@allowed-all.html

  * igt@i915_pm_dc@dc6-psr:
    - shard-skl:          NOTRUN -> [FAIL][125] ([i915#454])
   [125]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/igt@i915_pm_dc@dc6-psr.html

  * igt@i915_suspend@fence-restore-tiled2untiled:
    - shard-apl:          [PASS][126] -> [DMESG-WARN][127] ([i915#180]) +3 similar issues
   [126]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl7/igt@i915_suspend@fence-restore-tiled2untiled.html
   [127]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl3/igt@i915_suspend@fence-restore-tiled2untiled.html

  * igt@kms_ccs@pipe-a-crc-primary-rotation-180-4_tiled_dg2_mc_ccs:
    - shard-apl:          NOTRUN -> [SKIP][128] ([fdo#109271]) +5 similar issues
   [128]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl8/igt@kms_ccs@pipe-a-crc-primary-rotation-180-4_tiled_dg2_mc_ccs.html

  * igt@kms_ccs@pipe-b-bad-rotation-90-y_tiled_gen12_rc_ccs_cc:
    - shard-skl:          NOTRUN -> [SKIP][129] ([fdo#109271] / [i915#3886]) +4 similar issues
   [129]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl10/igt@kms_ccs@pipe-b-bad-rotation-90-y_tiled_gen12_rc_ccs_cc.html

  * igt@kms_color_chamelium@pipe-b-ctm-0-25:
    - shard-skl:          NOTRUN -> [SKIP][130] ([fdo#109271] / [fdo#111827]) +11 similar issues
   [130]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/igt@kms_color_chamelium@pipe-b-ctm-0-25.html

  * igt@kms_cursor_crc@pipe-c-cursor-suspend:
    - shard-kbl:          [PASS][131] -> [DMESG-WARN][132] ([i915#180]) +3 similar issues
   [131]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-kbl4/igt@kms_cursor_crc@pipe-c-cursor-suspend.html
   [132]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl6/igt@kms_cursor_crc@pipe-c-cursor-suspend.html

  * igt@kms_cursor_legacy@cursor-vs-flip-atomic-transitions:
    - shard-iclb:         [PASS][133] -> [FAIL][134] ([i915#5072])
   [133]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb8/igt@kms_cursor_legacy@cursor-vs-flip-atomic-transitions.html
   [134]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb7/igt@kms_cursor_legacy@cursor-vs-flip-atomic-transitions.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions:
    - shard-skl:          NOTRUN -> [FAIL][135] ([i915#2346])
   [135]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html

  * igt@kms_cursor_legacy@pipe-d-torture-bo:
    - shard-skl:          NOTRUN -> [SKIP][136] ([fdo#109271] / [i915#533])
   [136]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/igt@kms_cursor_legacy@pipe-d-torture-bo.html

  * igt@kms_dither@fb-8bpc-vs-panel-8bpc@pipe-a-hdmi-a-1:
    - shard-glk:          [PASS][137] -> [SKIP][138] ([fdo#109271])
   [137]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk5/igt@kms_dither@fb-8bpc-vs-panel-8bpc@pipe-a-hdmi-a-1.html
   [138]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk8/igt@kms_dither@fb-8bpc-vs-panel-8bpc@pipe-a-hdmi-a-1.html

  * igt@kms_flip@flip-vs-expired-vblank-interruptible@b-hdmi-a1:
    - shard-glk:          [PASS][139] -> [FAIL][140] ([i915#79])
   [139]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk4/igt@kms_flip@flip-vs-expired-vblank-interruptible@b-hdmi-a1.html
   [140]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk6/igt@kms_flip@flip-vs-expired-vblank-interruptible@b-hdmi-a1.html

  * igt@kms_flip@flip-vs-expired-vblank@b-edp1:
    - shard-skl:          [PASS][141] -> [FAIL][142] ([i915#79])
   [141]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl6/igt@kms_flip@flip-vs-expired-vblank@b-edp1.html
   [142]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl9/igt@kms_flip@flip-vs-expired-vblank@b-edp1.html

  * igt@kms_flip@plain-flip-ts-check@a-edp1:
    - shard-skl:          [PASS][143] -> [FAIL][144] ([i915#2122])
   [143]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl1/igt@kms_flip@plain-flip-ts-check@a-edp1.html
   [144]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl7/igt@kms_flip@plain-flip-ts-check@a-edp1.html

  * igt@kms_flip_scaled_crc@flip-64bpp-ytile-to-32bpp-ytile-downscaling:
    - shard-skl:          NOTRUN -> [SKIP][145] ([fdo#109271] / [i915#3701]) +1 similar issue
   [145]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/igt@kms_flip_scaled_crc@flip-64bpp-ytile-to-32bpp-ytile-downscaling.html

  * igt@kms_frontbuffer_tracking@psr-1p-primscrn-indfb-msflip-blt:
    - shard-iclb:         [PASS][146] -> [FAIL][147] ([i915#1888])
   [146]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb1/igt@kms_frontbuffer_tracking@psr-1p-primscrn-indfb-msflip-blt.html
   [147]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb3/igt@kms_frontbuffer_tracking@psr-1p-primscrn-indfb-msflip-blt.html

  * igt@kms_hdr@bpc-switch@pipe-a-dp-1:
    - shard-kbl:          [PASS][148] -> [FAIL][149] ([i915#1188])
   [148]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-kbl4/igt@kms_hdr@bpc-switch@pipe-a-dp-1.html
   [149]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl6/igt@kms_hdr@bpc-switch@pipe-a-dp-1.html

  * igt@kms_plane_alpha_blend@pipe-a-alpha-transparent-fb:
    - shard-skl:          NOTRUN -> [FAIL][150] ([i915#265])
   [150]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl4/igt@kms_plane_alpha_blend@pipe-a-alpha-transparent-fb.html

  * igt@kms_plane_scaling@plane-scaler-with-clipping-clamping-pixel-formats@pipe-b-edp-1:
    - shard-iclb:         [PASS][151] -> [SKIP][152] ([i915#5176]) +1 similar issue
   [151]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb1/igt@kms_plane_scaling@plane-scaler-with-clipping-clamping-pixel-formats@pipe-b-edp-1.html
   [152]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb3/igt@kms_plane_scaling@plane-scaler-with-clipping-clamping-pixel-formats@pipe-b-edp-1.html

  * igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area:
    - shard-skl:          NOTRUN -> [SKIP][153] ([fdo#109271] / [i915#658]) +2 similar issues
   [153]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl4/igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area.html

  * igt@kms_psr@psr2_no_drrs:
    - shard-iclb:         [PASS][154] -> [SKIP][155] ([fdo#109441]) +1 similar issue
   [154]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb2/igt@kms_psr@psr2_no_drrs.html
   [155]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb6/igt@kms_psr@psr2_no_drrs.html

  * igt@kms_rotation_crc@primary-4-tiled-reflect-x-180:
    - shard-skl:          NOTRUN -> [SKIP][156] ([fdo#109271]) +188 similar issues
   [156]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/igt@kms_rotation_crc@primary-4-tiled-reflect-x-180.html

  * igt@kms_vblank@pipe-a-ts-continuation-dpms-suspend:
    - shard-kbl:          [PASS][157] -> [INCOMPLETE][158] ([i915#3614])
   [157]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-kbl4/igt@kms_vblank@pipe-a-ts-continuation-dpms-suspend.html
   [158]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl6/igt@kms_vblank@pipe-a-ts-continuation-dpms-suspend.html

  * igt@perf@polling-small-buf:
    - shard-skl:          [PASS][159] -> [FAIL][160] ([i915#1722])
   [159]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl9/igt@perf@polling-small-buf.html
   [160]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl10/igt@perf@polling-small-buf.html

  * igt@sysfs_clients@sema-50:
    - shard-skl:          NOTRUN -> [SKIP][161] ([fdo#109271] / [i915#2994])
   [161]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl3/igt@sysfs_clients@sema-50.html

  
#### Possible fixes ####

  * igt@gem_exec_balancer@parallel-keep-in-fence:
    - shard-iclb:         [SKIP][162] ([i915#4525]) -> [PASS][163] +2 similar issues
   [162]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb5/igt@gem_exec_balancer@parallel-keep-in-fence.html
   [163]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb1/igt@gem_exec_balancer@parallel-keep-in-fence.html

  * igt@gem_exec_fair@basic-none@vcs0:
    - shard-kbl:          [FAIL][164] ([i915#2842]) -> [PASS][165] +4 similar issues
   [164]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-kbl3/igt@gem_exec_fair@basic-none@vcs0.html
   [165]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl6/igt@gem_exec_fair@basic-none@vcs0.html

  * igt@gem_huc_copy@huc-copy:
    - shard-tglb:         [SKIP][166] ([i915#2190]) -> [PASS][167]
   [166]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-tglb6/igt@gem_huc_copy@huc-copy.html
   [167]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-tglb8/igt@gem_huc_copy@huc-copy.html

  * igt@kms_cursor_legacy@cursor-vs-flip-toggle:
    - shard-iclb:         [FAIL][168] ([i915#5072]) -> [PASS][169]
   [168]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb7/igt@kms_cursor_legacy@cursor-vs-flip-toggle.html
   [169]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb1/igt@kms_cursor_legacy@cursor-vs-flip-toggle.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size:
    - shard-glk:          [FAIL][170] ([i915#2346] / [i915#533]) -> [PASS][171]
   [170]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk4/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html
   [171]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk7/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html

  * igt@kms_flip@plain-flip-ts-check-interruptible@c-edp1:
    - shard-skl:          [FAIL][172] ([i915#2122]) -> [PASS][173]
   [172]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl6/igt@kms_flip@plain-flip-ts-check-interruptible@c-edp1.html
   [173]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl4/igt@kms_flip@plain-flip-ts-check-interruptible@c-edp1.html

  * igt@kms_flip_scaled_crc@flip-32bpp-ytile-to-32bpp-ytileccs-upscaling:
    - shard-glk:          [FAIL][174] ([i915#4911]) -> [PASS][175]
   [174]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-glk8/igt@kms_flip_scaled_crc@flip-32bpp-ytile-to-32bpp-ytileccs-upscaling.html
   [175]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-glk5/igt@kms_flip_scaled_crc@flip-32bpp-ytile-to-32bpp-ytileccs-upscaling.html

  * igt@kms_flip_scaled_crc@flip-64bpp-ytile-to-16bpp-ytile-downscaling:
    - shard-iclb:         [SKIP][176] ([i915#3701]) -> [PASS][177] +2 similar issues
   [176]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb2/igt@kms_flip_scaled_crc@flip-64bpp-ytile-to-16bpp-ytile-downscaling.html
   [177]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb6/igt@kms_flip_scaled_crc@flip-64bpp-ytile-to-16bpp-ytile-downscaling.html

  * igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1:
    - shard-kbl:          [FAIL][178] ([i915#1188]) -> [PASS][179] +1 similar issue
   [178]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-kbl4/igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1.html
   [179]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl3/igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1.html

  * igt@kms_hdr@bpc-switch-suspend@pipe-a-dp-1:
    - shard-apl:          [DMESG-WARN][180] ([i915#180]) -> [PASS][181] +3 similar issues
   [180]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl3/igt@kms_hdr@bpc-switch-suspend@pipe-a-dp-1.html
   [181]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl8/igt@kms_hdr@bpc-switch-suspend@pipe-a-dp-1.html

  * igt@kms_plane_scaling@planes-downscale-factor-0-5@pipe-a-edp-1:
    - shard-iclb:         [SKIP][182] ([i915#5235]) -> [PASS][183] +2 similar issues
   [182]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb2/igt@kms_plane_scaling@planes-downscale-factor-0-5@pipe-a-edp-1.html
   [183]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb6/igt@kms_plane_scaling@planes-downscale-factor-0-5@pipe-a-edp-1.html

  * igt@kms_psr@psr2_dpms:
    - shard-iclb:         [SKIP][184] ([fdo#109441]) -> [PASS][185] +1 similar issue
   [184]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb4/igt@kms_psr@psr2_dpms.html
   [185]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb2/igt@kms_psr@psr2_dpms.html

  * igt@perf@polling-parameterized:
    - shard-tglb:         [FAIL][186] ([i915#5639]) -> [PASS][187]
   [186]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-tglb8/igt@perf@polling-parameterized.html
   [187]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-tglb7/igt@perf@polling-parameterized.html

  * igt@sysfs_heartbeat_interval@mixed@vecs0:
    - shard-skl:          [FAIL][188] ([i915#1731]) -> [PASS][189]
   [188]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl3/igt@sysfs_heartbeat_interval@mixed@vecs0.html
   [189]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl6/igt@sysfs_heartbeat_interval@mixed@vecs0.html

  
#### Warnings ####

  * igt@gem_eio@unwedge-stress:
    - shard-tglb:         [FAIL][190] ([i915#5784]) -> [TIMEOUT][191] ([i915#3063])
   [190]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-tglb2/igt@gem_eio@unwedge-stress.html
   [191]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-tglb7/igt@gem_eio@unwedge-stress.html

  * igt@gem_exec_balancer@parallel-ordering:
    - shard-iclb:         [FAIL][192] ([i915#6117]) -> [SKIP][193] ([i915#4525])
   [192]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb4/igt@gem_exec_balancer@parallel-ordering.html
   [193]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb5/igt@gem_exec_balancer@parallel-ordering.html

  * igt@kms_psr2_sf@cursor-plane-move-continuous-exceed-sf:
    - shard-iclb:         [SKIP][194] ([i915#658]) -> [SKIP][195] ([i915#2920])
   [194]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb4/igt@kms_psr2_sf@cursor-plane-move-continuous-exceed-sf.html
   [195]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb2/igt@kms_psr2_sf@cursor-plane-move-continuous-exceed-sf.html

  * igt@kms_psr2_sf@overlay-plane-update-continuous-sf:
    - shard-iclb:         [SKIP][196] ([fdo#111068] / [i915#658]) -> [SKIP][197] ([i915#2920])
   [196]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb5/igt@kms_psr2_sf@overlay-plane-update-continuous-sf.html
   [197]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb2/igt@kms_psr2_sf@overlay-plane-update-continuous-sf.html

  * igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area:
    - shard-iclb:         [SKIP][198] ([i915#2920]) -> [SKIP][199] ([fdo#111068] / [i915#658])
   [198]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb2/igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area.html
   [199]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb6/igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area.html

  * igt@kms_psr2_sf@primary-plane-update-sf-dmg-area-big-fb:
    - shard-iclb:         [SKIP][200] ([i915#2920]) -> [SKIP][201] ([i915#658]) +1 similar issue
   [200]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb2/igt@kms_psr2_sf@primary-plane-update-sf-dmg-area-big-fb.html
   [201]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb6/igt@kms_psr2_sf@primary-plane-update-sf-dmg-area-big-fb.html

  * igt@kms_psr2_su@page_flip-nv12:
    - shard-iclb:         [SKIP][202] ([fdo#109642] / [fdo#111068] / [i915#658]) -> [FAIL][203] ([i915#5939])
   [202]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-iclb7/igt@kms_psr2_su@page_flip-nv12.html
   [203]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-iclb2/igt@kms_psr2_su@page_flip-nv12.html

  * igt@prime_nv_api@i915_nv_double_export:
    - shard-skl:          [SKIP][204] ([fdo#109271]) -> [SKIP][205] ([fdo#109271] / [i915#1888]) +2 similar issues
   [204]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-skl9/igt@prime_nv_api@i915_nv_double_export.html
   [205]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-skl2/igt@prime_nv_api@i915_nv_double_export.html

  * igt@runner@aborted:
    - shard-apl:          ([FAIL][206], [FAIL][207], [FAIL][208], [FAIL][209], [FAIL][210]) ([i915#180] / [i915#3002] / [i915#4312] / [i915#5257]) -> ([FAIL][211], [FAIL][212], [FAIL][213], [FAIL][214], [FAIL][215], [FAIL][216], [FAIL][217], [FAIL][218]) ([fdo#109271] / [i915#180] / [i915#3002] / [i915#4312] / [i915#5257])
   [206]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl8/igt@runner@aborted.html
   [207]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl1/igt@runner@aborted.html
   [208]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl3/igt@runner@aborted.html
   [209]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl8/igt@runner@aborted.html
   [210]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-apl2/igt@runner@aborted.html
   [211]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl3/igt@runner@aborted.html
   [212]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl6/igt@runner@aborted.html
   [213]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl1/igt@runner@aborted.html
   [214]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl7/igt@runner@aborted.html
   [215]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl3/igt@runner@aborted.html
   [216]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl4/igt@runner@aborted.html
   [217]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl7/igt@runner@aborted.html
   [218]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-apl6/igt@runner@aborted.html
    - shard-kbl:          ([FAIL][219], [FAIL][220]) ([i915#3002] / [i915#4312] / [i915#5257]) -> ([FAIL][221], [FAIL][222], [FAIL][223], [FAIL][224], [FAIL][225], [FAIL][226]) ([i915#180] / [i915#3002] / [i915#4312] / [i915#5257])
   [219]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-kbl7/igt@runner@aborted.html
   [220]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11773/shard-kbl3/igt@runner@aborted.html
   [221]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl6/igt@runner@aborted.html
   [222]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl6/igt@runner@aborted.html
   [223]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl1/igt@runner@aborted.html
   [224]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl7/igt@runner@aborted.html
   [225]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl7/igt@runner@aborted.html
   [226]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/shard-kbl7/igt@runner@aborted.html

  
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109441]: https://bugs.freedesktop.org/show_bug.cgi?id=109441
  [fdo#109642]: https://bugs.freedesktop.org/show_bug.cgi?id=109642
  [fdo#111068]: https://bugs.freedesktop.org/show_bug.cgi?id=111068
  [fdo#111827]: https://bugs.freedesktop.org/show_bug.cgi?id=111827
  [i915#1188]: https://gitlab.freedesktop.org/drm/intel/issues/1188
  [i915#1722]: https://gitlab.freedesktop.org/drm/intel/issues/1722
  [i915#1731]: https://gitlab.freedesktop.org/drm/intel/issues/1731
  [i915#180]: https://gitlab.freedesktop.org/drm/intel/issues/180
  [i915#1888]: https://gitlab.freedesktop.org/drm/intel/issues/1888
  [i915#2122]: https://gitlab.freedesktop.org/drm/intel/issues/2122
  [i915#2190]: https://gitlab.freedesktop.org/drm/intel/issues/2190
  [i915#2346]: https://gitlab.freedesktop.org/drm/intel/issues/2346
  [i915#265]: https://gitlab.freedesktop.org/drm/intel/issues/265
  [i915#2842]: https://gitlab.freedesktop.org/drm/intel/issues/2842
  [i915#2920]: https://gitlab.freedesktop.org/drm/intel/issues/2920
  [i915#2994]: https://gitlab.freedesktop.org/drm/intel/issues/2994
  [i915#3002]: https://gitlab.freedesktop.org/drm/intel/issues/3002
  [i915#3063]: https://gitlab.freedesktop.org/drm/intel/issues/3063
  [i915#3323]: https://gitlab.freedesktop.org/drm/intel/issues/3323
  [i915#3614]: https://gitlab.freedesktop.org/drm/intel/issues/3614
  [i915#3701]: https://gitlab.freedesktop.org/drm/intel/issues/3701
  [i915#3886]: https://gitlab.freedesktop.org/drm/intel/issues/3886
  [i915#4312]: https://gitlab.freedesktop.org/drm/intel/issues/4312
  [i915#4386]: https://gitlab.freedesktop.org/drm/intel/issues/4386
  [i915#4525]: https://gitlab.freedesktop.org/drm/intel/issues/4525
  [i915#454]: https://gitlab.freedesktop.org/drm/intel/issues/454
  [i915#4613]: https://gitlab.freedesktop.org/drm/intel/issues/4613
  [i915#4793]: https://gitlab.freedesktop.org/drm/intel/issues/4793
  [i915#4911]: https://gitlab.freedesktop.org/drm/intel/issues/4911
  [i915#4991]: https://gitlab.freedesktop.org/drm/intel/issues/4991
  [i915#5032]: https://gitlab.freedesktop.org/drm/intel/issues/5032
  [i915#5072]: https://gitlab.freedesktop.org/drm/intel/issues/5072
  [i915#5176]: https://gitlab.freedesktop.org/drm/intel/issues/5176
  [i915#5235]: https://gitlab.freedesktop.org/drm/intel/issues/5235
  [i915#5257]: https://gitlab.freedesktop.org/drm/intel/issues/5257
  [i915#533]: https://gitlab.freedesktop.org/drm/intel/issues/533
  [i915#5566]: https://gitlab.freedesktop.org/drm/intel/issues/5566
  [i915#5639]: https://gitlab.freedesktop.org/drm/intel/issues/5639
  [i915#5784]: https://gitlab.freedesktop.org/drm/intel/issues/5784
  [i915#5939]: https://gitlab.freedesktop.org/drm/intel/issues/5939
  [i915#6117]: https://gitlab.freedesktop.org/drm/intel/issues/6117
  [i915#658]: https://gitlab.freedesktop.org/drm/intel/issues/658
  [i915#716]: https://gitlab.freedesktop.org/drm/intel/issues/716
  [i915#79]: https://gitlab.freedesktop.org/drm/intel/issues/79


Build changes
-------------

  * Linux: CI_DRM_11773 -> Patchwork_105085v3

  CI-20190529: 20190529
  CI_DRM_11773: 8025a295b7aa707f64c7984b7781c6f25e22a901 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_6533: 6b5107d91827962808441db6b98e478aa9e67bdb @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_105085v3: 8025a295b7aa707f64c7984b7781c6f25e22a901 @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_105085v3/index.html

[-- Attachment #2: Type: text/html, Size: 45369 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-06-17  8:00 ` [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Tvrtko Ursulin
@ 2022-07-27  6:01   ` Umesh Nerlige Ramappa
  2022-07-27  8:48     ` Tvrtko Ursulin
  0 siblings, 1 reply; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-07-27  6:01 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Fri, Jun 17, 2022 at 09:00:06AM +0100, Tvrtko Ursulin wrote:
>
>On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
>>From: John Harrison <John.C.Harrison@Intel.com>
>>
>>GuC provides engine_id and last_switch_in ticks for an active context in
>>the pphwsp. The context image provides a 32 bit total ticks which is the
>>accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>information is used to calculate the context busyness as follows:
>>
>>If the engine_id is valid, then busyness is the sum of accumulated total
>>ticks and active ticks. Active ticks is calculated with current gt time
>>as reference.
>>
>>If engine_id is invalid, busyness is equal to accumulated total ticks.
>>
>>Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
>>potential race was highlighted in an earlier review that can lead to
>>double accounting of busyness. While the solution to this is a wip,
>>busyness is still usable for platforms running GuC submission.
>>
>>v2: (Tvrtko)
>>- Use COPS_RUNTIME_ACTIVE_TOTAL
>>- Add code comment for the race
>>- Undo local variables initializations
>>
>>v3:
>>- Add support for virtual engines based on
>>   https://patchwork.freedesktop.org/series/105227/
>>
>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>---
>>  drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>>  drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>  drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 ++++++++++++++++++-
>>  drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>  6 files changed, 89 insertions(+), 11 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
>>index 4070cb5711d8..4a84146710e0 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_context.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>@@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>>  	child->parallel.parent = parent;
>>  }
>>-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>  {
>>  	u64 total, active;
>>+	if (ce->ops->update_stats)
>>+		ce->ops->update_stats(ce);
>>+
>>  	total = ce->stats.runtime.total;
>>  	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>  		total *= ce->engine->gt->clock_period_ns;
>>  	active = READ_ONCE(ce->stats.active);
>>-	if (active)
>>+	/*
>>+	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
>>+	 * already provides the total active time of the context, so skip this
>>+	 * calculation when this flag is set.
>>+	 */
>>+	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>  		active = intel_context_clock() - active;
>>  	return total + active;
>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
>>index b7d3214d2cdd..5fc7c19ab29b 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_context.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>@@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
>>  	return !!ce->parallel.number_children;
>>  }
>>-static inline bool intel_context_is_pinned(struct intel_context *ce);
>>+static inline bool intel_context_is_pinned(const struct intel_context *ce);
>>  static inline struct intel_context *
>>  intel_context_to_parent(struct intel_context *ce)
>>@@ -116,7 +116,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
>>   * Returns: true if the context is currently pinned for use by the GPU.
>>   */
>>  static inline bool
>>-intel_context_is_pinned(struct intel_context *ce)
>>+intel_context_is_pinned(const struct intel_context *ce)
>>  {
>>  	return atomic_read(&ce->pin_count);
>>  }
>>@@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
>>  	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>  }
>>-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>  u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>  static inline u64 intel_context_clock(void)
>>diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>index 09f82545789f..797bb4242c18 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>@@ -38,6 +38,9 @@ struct intel_context_ops {
>>  #define COPS_RUNTIME_CYCLES_BIT 1
>>  #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>+
>>  	int (*alloc)(struct intel_context *ce);
>>  	void (*ban)(struct intel_context *ce, struct i915_request *rq);
>>@@ -55,6 +58,8 @@ struct intel_context_ops {
>>  	void (*sched_disable)(struct intel_context *ce);
>>+	void (*update_stats)(struct intel_context *ce);
>>+
>>  	void (*reset)(struct intel_context *ce);
>>  	void (*destroy)(struct kref *kref);
>>@@ -146,6 +151,7 @@ struct intel_context {
>>  			struct ewma_runtime avg;
>>  			u64 total;
>>  			u32 last;
>>+			u64 start_gt_clk;
>>  			I915_SELFTEST_DECLARE(u32 num_underflow);
>>  			I915_SELFTEST_DECLARE(u32 max_underflow);
>>  		} runtime;
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>index b3c9a9327f76..6231ad03e4eb 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>@@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
>>  	return guc_class_engine_class_map[guc_class];
>>  }
>>+/* Per context engine usage stats: */
>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>+
>>  /* Work item for submitting workloads into work queue of GuC. */
>>  struct guc_wq_item {
>>  	u32 header;
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>index 5a1dfacf24ea..cbf3cbb983ce 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>@@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
>>  	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>  }
>>-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>>+static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
>>  {
>>  	return &ce->engine->gt->uc.guc;
>>  }
>>@@ -1323,13 +1323,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
>>  	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>  }
>>+static void __guc_context_update_clks(struct intel_context *ce);
>>  static void guc_timestamp_ping(struct work_struct *wrk)
>>  {
>>  	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>  					     timestamp.work.work);
>>  	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>  	struct intel_gt *gt = guc_to_gt(guc);
>>+	struct intel_context *ce;
>>  	intel_wakeref_t wakeref;
>>+	unsigned long index;
>>  	int srcu, ret;
>>  	/*
>>@@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>>  	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>  		__update_guc_busyness_stats(guc);
>>+	/* adjust context stats for overflow */
>>+	xa_for_each(&guc->context_lookup, index, ce)
>>+		__guc_context_update_clks(ce);
>>+
>>  	intel_gt_reset_unlock(gt, srcu);
>>  	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>@@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
>>  			 guc->timestamp.ping_delay);
>>  }
>>+static void __guc_context_update_clks(struct intel_context *ce)
>>+{
>>+	struct intel_guc *guc = ce_to_guc(ce);
>>+	struct intel_gt *gt = ce->engine->gt;
>>+	u32 *pphwsp, last_switch, engine_id;
>>+	u64 start_gt_clk, active;
>>+	unsigned long flags;
>>+	ktime_t unused;
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	/*
>>+	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
>>+	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>>+	 * relies on GuC and GPU for busyness calculations. Due to this, A
>>+	 * potential race was highlighted in an earlier review that can lead to
>>+	 * double accounting of busyness. While the solution to this is a wip,
>>+	 * busyness is still usable for platforms running GuC submission.
>>+	 */
>>+	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>+	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>+	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>+
>>+	guc_update_pm_timestamp(guc, &unused);
>>+
>>+	if (engine_id != 0xffffffff && last_switch) {
>>+		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>+		__extend_last_switch(guc, &start_gt_clk, last_switch);
>>+		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
>>+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>+		WRITE_ONCE(ce->stats.active, active);
>>+	} else {
>>+		lrc_update_runtime(ce);
>>+	}
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+}
>>+
>>+static void guc_context_update_stats(struct intel_context *ce)
>>+{
>>+	if (!intel_context_pin_if_active(ce)) {
>>+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>+		WRITE_ONCE(ce->stats.active, 0);
>>+		return;
>>+	}
>>+
>>+	__guc_context_update_clks(ce);
>>+	intel_context_unpin(ce);
>>+}
>>+
>>  static inline bool
>>  submission_disabled(struct intel_guc *guc)
>>  {
>>@@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct intel_context *ce)
>>  {
>>  	struct intel_guc *guc = ce_to_guc(ce);
>>+	lrc_update_runtime(ce);
>>  	unpin_guc_id(guc, ce);
>>  	lrc_unpin(ce);
>>@@ -3183,6 +3241,7 @@ static void remove_from_context(struct i915_request *rq)
>>  }
>>  static const struct intel_context_ops guc_context_ops = {
>>+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>  	.alloc = guc_context_alloc,
>>  	.pre_pin = guc_context_pre_pin,
>>@@ -3199,6 +3258,8 @@ static const struct intel_context_ops guc_context_ops = {
>>  	.sched_disable = guc_context_sched_disable,
>>+	.update_stats = guc_context_update_stats,
>>+
>>  	.reset = lrc_reset,
>>  	.destroy = guc_context_destroy,
>>@@ -3432,6 +3493,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
>>  }
>>  static const struct intel_context_ops virtual_guc_context_ops = {
>>+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>  	.alloc = guc_virtual_context_alloc,
>>  	.pre_pin = guc_virtual_context_pre_pin,
>>@@ -3447,6 +3509,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
>>  	.exit = guc_virtual_context_exit,
>>  	.sched_disable = guc_context_sched_disable,
>>+	.update_stats = guc_context_update_stats,
>
>There are also virtual_parent_context_ops and 
>virtual_child_context_ops - which means more test coverage is needed..

Trying to come back to this... The 
virtual_parent_context_ops/virtual_child_context_ops are used for 
parallel engines. GuC would only update the pphwsp of the parent context 
with the last_switched_in_time.

In general, how should I report the busyness for a parallel engine?

I would think it is busyness reported by parent context multiplied by 
width.

Thanks,
Umesh

>
>Regards,
>
>Tvrtko
>
>>  	.destroy = guc_context_destroy,
>>diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
>>index 18d38cb59923..118db6f03f15 100644
>>--- a/drivers/gpu/drm/i915/i915_drm_client.c
>>+++ b/drivers/gpu/drm/i915/i915_drm_client.c
>>@@ -146,11 +146,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
>>  		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>>  	seq_printf(m, "drm-client-id:\t%u\n", client->id);
>>-	/*
>>-	 * Temporarily skip showing client engine information with GuC submission till
>>-	 * fetching engine busyness is implemented in the GuC submission backend
>>-	 */
>>-	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
>>+	if (GRAPHICS_VER(i915) < 8)
>>  		return;
>>  	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-07-27  6:01   ` Umesh Nerlige Ramappa
@ 2022-07-27  8:48     ` Tvrtko Ursulin
  2022-08-01 19:02       ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 29+ messages in thread
From: Tvrtko Ursulin @ 2022-07-27  8:48 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx


On 27/07/2022 07:01, Umesh Nerlige Ramappa wrote:
> On Fri, Jun 17, 2022 at 09:00:06AM +0100, Tvrtko Ursulin wrote:
>>
>> On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> GuC provides engine_id and last_switch_in ticks for an active context in
>>> the pphwsp. The context image provides a 32 bit total ticks which is the
>>> accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>> information is used to calculate the context busyness as follows:
>>>
>>> If the engine_id is valid, then busyness is the sum of accumulated total
>>> ticks and active ticks. Active ticks is calculated with current gt time
>>> as reference.
>>>
>>> If engine_id is invalid, busyness is equal to accumulated total ticks.
>>>
>>> Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
>>> potential race was highlighted in an earlier review that can lead to
>>> double accounting of busyness. While the solution to this is a wip,
>>> busyness is still usable for platforms running GuC submission.
>>>
>>> v2: (Tvrtko)
>>> - Use COPS_RUNTIME_ACTIVE_TOTAL
>>> - Add code comment for the race
>>> - Undo local variables initializations
>>>
>>> v3:
>>> - Add support for virtual engines based on
>>>   https://patchwork.freedesktop.org/series/105227/
>>>
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>>>  drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>>  drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 ++++++++++++++++++-
>>>  drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>>  6 files changed, 89 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>> b/drivers/gpu/drm/i915/gt/intel_context.c
>>> index 4070cb5711d8..4a84146710e0 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct 
>>> intel_context *parent,
>>>      child->parallel.parent = parent;
>>>  }
>>> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>>> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>  {
>>>      u64 total, active;
>>> +    if (ce->ops->update_stats)
>>> +        ce->ops->update_stats(ce);
>>> +
>>>      total = ce->stats.runtime.total;
>>>      if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>          total *= ce->engine->gt->clock_period_ns;
>>>      active = READ_ONCE(ce->stats.active);
>>> -    if (active)
>>> +    /*
>>> +     * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
>>> +     * already provides the total active time of the context, so 
>>> skip this
>>> +     * calculation when this flag is set.
>>> +     */
>>> +    if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>>          active = intel_context_clock() - active;
>>>      return total + active;
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>> b/drivers/gpu/drm/i915/gt/intel_context.h
>>> index b7d3214d2cdd..5fc7c19ab29b 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>> @@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct 
>>> intel_context *ce)
>>>      return !!ce->parallel.number_children;
>>>  }
>>> -static inline bool intel_context_is_pinned(struct intel_context *ce);
>>> +static inline bool intel_context_is_pinned(const struct 
>>> intel_context *ce);
>>>  static inline struct intel_context *
>>>  intel_context_to_parent(struct intel_context *ce)
>>> @@ -116,7 +116,7 @@ static inline int 
>>> intel_context_lock_pinned(struct intel_context *ce)
>>>   * Returns: true if the context is currently pinned for use by the GPU.
>>>   */
>>>  static inline bool
>>> -intel_context_is_pinned(struct intel_context *ce)
>>> +intel_context_is_pinned(const struct intel_context *ce)
>>>  {
>>>      return atomic_read(&ce->pin_count);
>>>  }
>>> @@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct 
>>> intel_context *ce)
>>>      clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>>  }
>>> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
>>> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>>  u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>>  static inline u64 intel_context_clock(void)
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
>>> b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> index 09f82545789f..797bb4242c18 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> @@ -38,6 +38,9 @@ struct intel_context_ops {
>>>  #define COPS_RUNTIME_CYCLES_BIT 1
>>>  #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>> +#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>> +#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>> +
>>>      int (*alloc)(struct intel_context *ce);
>>>      void (*ban)(struct intel_context *ce, struct i915_request *rq);
>>> @@ -55,6 +58,8 @@ struct intel_context_ops {
>>>      void (*sched_disable)(struct intel_context *ce);
>>> +    void (*update_stats)(struct intel_context *ce);
>>> +
>>>      void (*reset)(struct intel_context *ce);
>>>      void (*destroy)(struct kref *kref);
>>> @@ -146,6 +151,7 @@ struct intel_context {
>>>              struct ewma_runtime avg;
>>>              u64 total;
>>>              u32 last;
>>> +            u64 start_gt_clk;
>>>              I915_SELFTEST_DECLARE(u32 num_underflow);
>>>              I915_SELFTEST_DECLARE(u32 max_underflow);
>>>          } runtime;
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> index b3c9a9327f76..6231ad03e4eb 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> @@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 
>>> guc_class)
>>>      return guc_class_engine_class_map[guc_class];
>>>  }
>>> +/* Per context engine usage stats: */
>>> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO    (0x500 / sizeof(u32))
>>> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI    
>>> (PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>> +#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID    
>>> (PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>> +
>>>  /* Work item for submitting workloads into work queue of GuC. */
>>>  struct guc_wq_item {
>>>      u32 header;
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> index 5a1dfacf24ea..cbf3cbb983ce 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> @@ -378,7 +378,7 @@ static inline void 
>>> set_context_guc_id_invalid(struct intel_context *ce)
>>>      ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>>  }
>>> -static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>>> +static inline struct intel_guc *ce_to_guc(const struct intel_context 
>>> *ce)
>>>  {
>>>      return &ce->engine->gt->uc.guc;
>>>  }
>>> @@ -1323,13 +1323,16 @@ static void 
>>> __update_guc_busyness_stats(struct intel_guc *guc)
>>>      spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>  }
>>> +static void __guc_context_update_clks(struct intel_context *ce);
>>>  static void guc_timestamp_ping(struct work_struct *wrk)
>>>  {
>>>      struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>>                           timestamp.work.work);
>>>      struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>>      struct intel_gt *gt = guc_to_gt(guc);
>>> +    struct intel_context *ce;
>>>      intel_wakeref_t wakeref;
>>> +    unsigned long index;
>>>      int srcu, ret;
>>>      /*
>>> @@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct 
>>> work_struct *wrk)
>>>      with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>>          __update_guc_busyness_stats(guc);
>>> +    /* adjust context stats for overflow */
>>> +    xa_for_each(&guc->context_lookup, index, ce)
>>> +        __guc_context_update_clks(ce);
>>> +
>>>      intel_gt_reset_unlock(gt, srcu);
>>>      mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>> @@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct intel_gt 
>>> *gt)
>>>               guc->timestamp.ping_delay);
>>>  }
>>> +static void __guc_context_update_clks(struct intel_context *ce)
>>> +{
>>> +    struct intel_guc *guc = ce_to_guc(ce);
>>> +    struct intel_gt *gt = ce->engine->gt;
>>> +    u32 *pphwsp, last_switch, engine_id;
>>> +    u64 start_gt_clk, active;
>>> +    unsigned long flags;
>>> +    ktime_t unused;
>>> +
>>> +    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>> +
>>> +    /*
>>> +     * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is 
>>> switched
>>> +     * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>>> +     * relies on GuC and GPU for busyness calculations. Due to this, A
>>> +     * potential race was highlighted in an earlier review that can 
>>> lead to
>>> +     * double accounting of busyness. While the solution to this is 
>>> a wip,
>>> +     * busyness is still usable for platforms running GuC submission.
>>> +     */
>>> +    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>> +    last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>> +    engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>> +
>>> +    guc_update_pm_timestamp(guc, &unused);
>>> +
>>> +    if (engine_id != 0xffffffff && last_switch) {
>>> +        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>> +        __extend_last_switch(guc, &start_gt_clk, last_switch);
>>> +        active = intel_gt_clock_interval_to_ns(gt, 
>>> guc->timestamp.gt_stamp - start_gt_clk);
>>> +        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>> +        WRITE_ONCE(ce->stats.active, active);
>>> +    } else {
>>> +        lrc_update_runtime(ce);
>>> +    }
>>> +
>>> +    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>> +}
>>> +
>>> +static void guc_context_update_stats(struct intel_context *ce)
>>> +{
>>> +    if (!intel_context_pin_if_active(ce)) {
>>> +        WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>> +        WRITE_ONCE(ce->stats.active, 0);
>>> +        return;
>>> +    }
>>> +
>>> +    __guc_context_update_clks(ce);
>>> +    intel_context_unpin(ce);
>>> +}
>>> +
>>>  static inline bool
>>>  submission_disabled(struct intel_guc *guc)
>>>  {
>>> @@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct 
>>> intel_context *ce)
>>>  {
>>>      struct intel_guc *guc = ce_to_guc(ce);
>>> +    lrc_update_runtime(ce);
>>>      unpin_guc_id(guc, ce);
>>>      lrc_unpin(ce);
>>> @@ -3183,6 +3241,7 @@ static void remove_from_context(struct 
>>> i915_request *rq)
>>>  }
>>>  static const struct intel_context_ops guc_context_ops = {
>>> +    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>      .alloc = guc_context_alloc,
>>>      .pre_pin = guc_context_pre_pin,
>>> @@ -3199,6 +3258,8 @@ static const struct intel_context_ops 
>>> guc_context_ops = {
>>>      .sched_disable = guc_context_sched_disable,
>>> +    .update_stats = guc_context_update_stats,
>>> +
>>>      .reset = lrc_reset,
>>>      .destroy = guc_context_destroy,
>>> @@ -3432,6 +3493,7 @@ static int guc_virtual_context_alloc(struct 
>>> intel_context *ce)
>>>  }
>>>  static const struct intel_context_ops virtual_guc_context_ops = {
>>> +    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>      .alloc = guc_virtual_context_alloc,
>>>      .pre_pin = guc_virtual_context_pre_pin,
>>> @@ -3447,6 +3509,7 @@ static const struct intel_context_ops 
>>> virtual_guc_context_ops = {
>>>      .exit = guc_virtual_context_exit,
>>>      .sched_disable = guc_context_sched_disable,
>>> +    .update_stats = guc_context_update_stats,
>>
>> There are also virtual_parent_context_ops and 
>> virtual_child_context_ops - which means more test coverage is needed..
> 
> Trying to come back to this... The 
> virtual_parent_context_ops/virtual_child_context_ops are used for 
> parallel engines. GuC would only update the pphwsp of the parent context 
> with the last_switched_in_time.
> 
> In general, how should I report the busyness for a parallel engine?
> 
> I would think it is busyness reported by parent context multiplied by 
> width.

That could a reasonable approximation but I can't say for certain. 
Depends on the GuC scheduler implementation a bit. Like is anything 
preventing child contexts from finishing their useful work ahead of the 
parent context, or they are always strictly scheduled as one entity and 
child engines are blocked from taking other workloads until the parent 
is scheduled out?

Regards,

Tvrtko

> Thanks,
> Umesh
> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>>      .destroy = guc_context_destroy,
>>> diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
>>> b/drivers/gpu/drm/i915/i915_drm_client.c
>>> index 18d38cb59923..118db6f03f15 100644
>>> --- a/drivers/gpu/drm/i915/i915_drm_client.c
>>> +++ b/drivers/gpu/drm/i915/i915_drm_client.c
>>> @@ -146,11 +146,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, 
>>> struct file *f)
>>>             PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>>>      seq_printf(m, "drm-client-id:\t%u\n", client->id);
>>> -    /*
>>> -     * Temporarily skip showing client engine information with GuC 
>>> submission till
>>> -     * fetching engine busyness is implemented in the GuC submission 
>>> backend
>>> -     */
>>> -    if (GRAPHICS_VER(i915) < 8 || 
>>> intel_uc_uses_guc_submission(&i915->gt0.uc))
>>> +    if (GRAPHICS_VER(i915) < 8)
>>>          return;
>>>      for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-07-27  8:48     ` Tvrtko Ursulin
@ 2022-08-01 19:02       ` Umesh Nerlige Ramappa
  2022-08-02  8:41         ` Tvrtko Ursulin
  0 siblings, 1 reply; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-08-01 19:02 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Wed, Jul 27, 2022 at 09:48:18AM +0100, Tvrtko Ursulin wrote:
>
>On 27/07/2022 07:01, Umesh Nerlige Ramappa wrote:
>>On Fri, Jun 17, 2022 at 09:00:06AM +0100, Tvrtko Ursulin wrote:
>>>
>>>On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
>>>>From: John Harrison <John.C.Harrison@Intel.com>
>>>>
>>>>GuC provides engine_id and last_switch_in ticks for an active context in
>>>>the pphwsp. The context image provides a 32 bit total ticks which is the
>>>>accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>>>information is used to calculate the context busyness as follows:
>>>>
>>>>If the engine_id is valid, then busyness is the sum of accumulated total
>>>>ticks and active ticks. Active ticks is calculated with current gt time
>>>>as reference.
>>>>
>>>>If engine_id is invalid, busyness is equal to accumulated total ticks.
>>>>
>>>>Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
>>>>potential race was highlighted in an earlier review that can lead to
>>>>double accounting of busyness. While the solution to this is a wip,
>>>>busyness is still usable for platforms running GuC submission.
>>>>
>>>>v2: (Tvrtko)
>>>>- Use COPS_RUNTIME_ACTIVE_TOTAL
>>>>- Add code comment for the race
>>>>- Undo local variables initializations
>>>>
>>>>v3:
>>>>- Add support for virtual engines based on
>>>>  https://patchwork.freedesktop.org/series/105227/
>>>>
>>>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>>---
>>>> drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>>>> drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>>> drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>>>> drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>>> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 ++++++++++++++++++-
>>>> drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>>> 6 files changed, 89 insertions(+), 11 deletions(-)
>>>>
>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>>>b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>index 4070cb5711d8..4a84146710e0 100644
>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.c
>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>@@ -576,16 +576,24 @@ void 
>>>>intel_context_bind_parent_child(struct intel_context *parent,
>>>>     child->parallel.parent = parent;
>>>> }
>>>>-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>> {
>>>>     u64 total, active;
>>>>+    if (ce->ops->update_stats)
>>>>+        ce->ops->update_stats(ce);
>>>>+
>>>>     total = ce->stats.runtime.total;
>>>>     if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>>         total *= ce->engine->gt->clock_period_ns;
>>>>     active = READ_ONCE(ce->stats.active);
>>>>-    if (active)
>>>>+    /*
>>>>+     * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
>>>>+     * already provides the total active time of the context, 
>>>>so skip this
>>>>+     * calculation when this flag is set.
>>>>+     */
>>>>+    if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>>>         active = intel_context_clock() - active;
>>>>     return total + active;
>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>>>b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>index b7d3214d2cdd..5fc7c19ab29b 100644
>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.h
>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>@@ -56,7 +56,7 @@ static inline bool 
>>>>intel_context_is_parent(struct intel_context *ce)
>>>>     return !!ce->parallel.number_children;
>>>> }
>>>>-static inline bool intel_context_is_pinned(struct intel_context *ce);
>>>>+static inline bool intel_context_is_pinned(const struct 
>>>>intel_context *ce);
>>>> static inline struct intel_context *
>>>> intel_context_to_parent(struct intel_context *ce)
>>>>@@ -116,7 +116,7 @@ static inline int 
>>>>intel_context_lock_pinned(struct intel_context *ce)
>>>>  * Returns: true if the context is currently pinned for use by the GPU.
>>>>  */
>>>> static inline bool
>>>>-intel_context_is_pinned(struct intel_context *ce)
>>>>+intel_context_is_pinned(const struct intel_context *ce)
>>>> {
>>>>     return atomic_read(&ce->pin_count);
>>>> }
>>>>@@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct 
>>>>intel_context *ce)
>>>>     clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>>> }
>>>>-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>>> u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>>> static inline u64 intel_context_clock(void)
>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
>>>>b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>index 09f82545789f..797bb4242c18 100644
>>>>--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>@@ -38,6 +38,9 @@ struct intel_context_ops {
>>>> #define COPS_RUNTIME_CYCLES_BIT 1
>>>> #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>>>+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>>>+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>>>+
>>>>     int (*alloc)(struct intel_context *ce);
>>>>     void (*ban)(struct intel_context *ce, struct i915_request *rq);
>>>>@@ -55,6 +58,8 @@ struct intel_context_ops {
>>>>     void (*sched_disable)(struct intel_context *ce);
>>>>+    void (*update_stats)(struct intel_context *ce);
>>>>+
>>>>     void (*reset)(struct intel_context *ce);
>>>>     void (*destroy)(struct kref *kref);
>>>>@@ -146,6 +151,7 @@ struct intel_context {
>>>>             struct ewma_runtime avg;
>>>>             u64 total;
>>>>             u32 last;
>>>>+            u64 start_gt_clk;
>>>>             I915_SELFTEST_DECLARE(u32 num_underflow);
>>>>             I915_SELFTEST_DECLARE(u32 max_underflow);
>>>>         } runtime;
>>>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
>>>>b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>index b3c9a9327f76..6231ad03e4eb 100644
>>>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>@@ -196,6 +196,11 @@ static inline u8 
>>>>guc_class_to_engine_class(u8 guc_class)
>>>>     return guc_class_engine_class_map[guc_class];
>>>> }
>>>>+/* Per context engine usage stats: */
>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO    (0x500 / sizeof(u32))
>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI    
>>>>(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID    
>>>>(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>>>+
>>>> /* Work item for submitting workloads into work queue of GuC. */
>>>> struct guc_wq_item {
>>>>     u32 header;
>>>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>>b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>index 5a1dfacf24ea..cbf3cbb983ce 100644
>>>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>@@ -378,7 +378,7 @@ static inline void 
>>>>set_context_guc_id_invalid(struct intel_context *ce)
>>>>     ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>>> }
>>>>-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>>>>+static inline struct intel_guc *ce_to_guc(const struct 
>>>>intel_context *ce)
>>>> {
>>>>     return &ce->engine->gt->uc.guc;
>>>> }
>>>>@@ -1323,13 +1323,16 @@ static void 
>>>>__update_guc_busyness_stats(struct intel_guc *guc)
>>>>     spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>> }
>>>>+static void __guc_context_update_clks(struct intel_context *ce);
>>>> static void guc_timestamp_ping(struct work_struct *wrk)
>>>> {
>>>>     struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>>>                          timestamp.work.work);
>>>>     struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>>>     struct intel_gt *gt = guc_to_gt(guc);
>>>>+    struct intel_context *ce;
>>>>     intel_wakeref_t wakeref;
>>>>+    unsigned long index;
>>>>     int srcu, ret;
>>>>     /*
>>>>@@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct 
>>>>work_struct *wrk)
>>>>     with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>>>         __update_guc_busyness_stats(guc);
>>>>+    /* adjust context stats for overflow */
>>>>+    xa_for_each(&guc->context_lookup, index, ce)
>>>>+        __guc_context_update_clks(ce);
>>>>+
>>>>     intel_gt_reset_unlock(gt, srcu);
>>>>     mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>>>@@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct 
>>>>intel_gt *gt)
>>>>              guc->timestamp.ping_delay);
>>>> }
>>>>+static void __guc_context_update_clks(struct intel_context *ce)
>>>>+{
>>>>+    struct intel_guc *guc = ce_to_guc(ce);
>>>>+    struct intel_gt *gt = ce->engine->gt;
>>>>+    u32 *pphwsp, last_switch, engine_id;
>>>>+    u64 start_gt_clk, active;
>>>>+    unsigned long flags;
>>>>+    ktime_t unused;
>>>>+
>>>>+    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>>>+
>>>>+    /*
>>>>+     * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when 
>>>>context is switched
>>>>+     * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>>>>+     * relies on GuC and GPU for busyness calculations. Due to this, A
>>>>+     * potential race was highlighted in an earlier review that 
>>>>can lead to
>>>>+     * double accounting of busyness. While the solution to 
>>>>this is a wip,
>>>>+     * busyness is still usable for platforms running GuC submission.
>>>>+     */
>>>>+    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>>>+    last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>>>+    engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>>>+
>>>>+    guc_update_pm_timestamp(guc, &unused);
>>>>+
>>>>+    if (engine_id != 0xffffffff && last_switch) {
>>>>+        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>>>+        __extend_last_switch(guc, &start_gt_clk, last_switch);
>>>>+        active = intel_gt_clock_interval_to_ns(gt, 
>>>>guc->timestamp.gt_stamp - start_gt_clk);
>>>>+        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>>>+        WRITE_ONCE(ce->stats.active, active);
>>>>+    } else {
>>>>+        lrc_update_runtime(ce);
>>>>+    }
>>>>+
>>>>+    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>+}
>>>>+
>>>>+static void guc_context_update_stats(struct intel_context *ce)
>>>>+{
>>>>+    if (!intel_context_pin_if_active(ce)) {
>>>>+        WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>>>+        WRITE_ONCE(ce->stats.active, 0);
>>>>+        return;
>>>>+    }
>>>>+
>>>>+    __guc_context_update_clks(ce);
>>>>+    intel_context_unpin(ce);
>>>>+}
>>>>+
>>>> static inline bool
>>>> submission_disabled(struct intel_guc *guc)
>>>> {
>>>>@@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct 
>>>>intel_context *ce)
>>>> {
>>>>     struct intel_guc *guc = ce_to_guc(ce);
>>>>+    lrc_update_runtime(ce);
>>>>     unpin_guc_id(guc, ce);
>>>>     lrc_unpin(ce);
>>>>@@ -3183,6 +3241,7 @@ static void remove_from_context(struct 
>>>>i915_request *rq)
>>>> }
>>>> static const struct intel_context_ops guc_context_ops = {
>>>>+    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>     .alloc = guc_context_alloc,
>>>>     .pre_pin = guc_context_pre_pin,
>>>>@@ -3199,6 +3258,8 @@ static const struct intel_context_ops 
>>>>guc_context_ops = {
>>>>     .sched_disable = guc_context_sched_disable,
>>>>+    .update_stats = guc_context_update_stats,
>>>>+
>>>>     .reset = lrc_reset,
>>>>     .destroy = guc_context_destroy,
>>>>@@ -3432,6 +3493,7 @@ static int 
>>>>guc_virtual_context_alloc(struct intel_context *ce)
>>>> }
>>>> static const struct intel_context_ops virtual_guc_context_ops = {
>>>>+    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>     .alloc = guc_virtual_context_alloc,
>>>>     .pre_pin = guc_virtual_context_pre_pin,
>>>>@@ -3447,6 +3509,7 @@ static const struct intel_context_ops 
>>>>virtual_guc_context_ops = {
>>>>     .exit = guc_virtual_context_exit,
>>>>     .sched_disable = guc_context_sched_disable,
>>>>+    .update_stats = guc_context_update_stats,
>>>
>>>There are also virtual_parent_context_ops and 
>>>virtual_child_context_ops - which means more test coverage is 
>>>needed..
>>
>>Trying to come back to this... The 
>>virtual_parent_context_ops/virtual_child_context_ops are used for 
>>parallel engines. GuC would only update the pphwsp of the parent 
>>context with the last_switched_in_time.
>>
>>In general, how should I report the busyness for a parallel engine?
>>
>>I would think it is busyness reported by parent context multiplied 
>>by width.
>
>That could a reasonable approximation but I can't say for certain. 
>Depends on the GuC scheduler implementation a bit. Like is anything 
>preventing child contexts from finishing their useful work ahead of 
>the parent context, or they are always strictly scheduled as one 
>entity and child engines are blocked from taking other workloads until 
>the parent is scheduled out?

Correct, if a child finishes the work before parent/siblings for some 
reason, it cannot take up other work until all siblings are done.

Regards,
Umesh

>
>Regards,
>
>Tvrtko
>
>>Thanks,
>>Umesh
>>
>>>
>>>Regards,
>>>
>>>Tvrtko
>>>
>>>>     .destroy = guc_context_destroy,
>>>>diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
>>>>b/drivers/gpu/drm/i915/i915_drm_client.c
>>>>index 18d38cb59923..118db6f03f15 100644
>>>>--- a/drivers/gpu/drm/i915/i915_drm_client.c
>>>>+++ b/drivers/gpu/drm/i915/i915_drm_client.c
>>>>@@ -146,11 +146,7 @@ void i915_drm_client_fdinfo(struct seq_file 
>>>>*m, struct file *f)
>>>>            PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>>>>     seq_printf(m, "drm-client-id:\t%u\n", client->id);
>>>>-    /*
>>>>-     * Temporarily skip showing client engine information with 
>>>>GuC submission till
>>>>-     * fetching engine busyness is implemented in the GuC 
>>>>submission backend
>>>>-     */
>>>>-    if (GRAPHICS_VER(i915) < 8 || 
>>>>intel_uc_uses_guc_submission(&i915->gt0.uc))
>>>>+    if (GRAPHICS_VER(i915) < 8)
>>>>         return;
>>>>     for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-01 19:02       ` Umesh Nerlige Ramappa
@ 2022-08-02  8:41         ` Tvrtko Ursulin
  2022-08-02 23:38           ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 29+ messages in thread
From: Tvrtko Ursulin @ 2022-08-02  8:41 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx


On 01/08/2022 20:02, Umesh Nerlige Ramappa wrote:
> On Wed, Jul 27, 2022 at 09:48:18AM +0100, Tvrtko Ursulin wrote:
>>
>> On 27/07/2022 07:01, Umesh Nerlige Ramappa wrote:
>>> On Fri, Jun 17, 2022 at 09:00:06AM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>
>>>>> GuC provides engine_id and last_switch_in ticks for an active 
>>>>> context in
>>>>> the pphwsp. The context image provides a 32 bit total ticks which 
>>>>> is the
>>>>> accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>>>> information is used to calculate the context busyness as follows:
>>>>>
>>>>> If the engine_id is valid, then busyness is the sum of accumulated 
>>>>> total
>>>>> ticks and active ticks. Active ticks is calculated with current gt 
>>>>> time
>>>>> as reference.
>>>>>
>>>>> If engine_id is invalid, busyness is equal to accumulated total ticks.
>>>>>
>>>>> Since KMD (CPU) retrieves busyness data from 2 sources - GPU and 
>>>>> GuC, a
>>>>> potential race was highlighted in an earlier review that can lead to
>>>>> double accounting of busyness. While the solution to this is a wip,
>>>>> busyness is still usable for platforms running GuC submission.
>>>>>
>>>>> v2: (Tvrtko)
>>>>> - Use COPS_RUNTIME_ACTIVE_TOTAL
>>>>> - Add code comment for the race
>>>>> - Undo local variables initializations
>>>>>
>>>>> v3:
>>>>> - Add support for virtual engines based on
>>>>>   https://patchwork.freedesktop.org/series/105227/
>>>>>
>>>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>>> ---
>>>>>  drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>>>>>  drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>>>>  drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>>>>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>>>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 
>>>>> ++++++++++++++++++-
>>>>>  drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>>>>  6 files changed, 89 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>>>> b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>> index 4070cb5711d8..4a84146710e0 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct 
>>>>> intel_context *parent,
>>>>>      child->parallel.parent = parent;
>>>>>  }
>>>>> -u64 intel_context_get_total_runtime_ns(const struct intel_context 
>>>>> *ce)
>>>>> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>>>  {
>>>>>      u64 total, active;
>>>>> +    if (ce->ops->update_stats)
>>>>> +        ce->ops->update_stats(ce);
>>>>> +
>>>>>      total = ce->stats.runtime.total;
>>>>>      if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>>>          total *= ce->engine->gt->clock_period_ns;
>>>>>      active = READ_ONCE(ce->stats.active);
>>>>> -    if (active)
>>>>> +    /*
>>>>> +     * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the 
>>>>> backend
>>>>> +     * already provides the total active time of the context, so 
>>>>> skip this
>>>>> +     * calculation when this flag is set.
>>>>> +     */
>>>>> +    if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>>>>          active = intel_context_clock() - active;
>>>>>      return total + active;
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>>>> b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>> index b7d3214d2cdd..5fc7c19ab29b 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_context.h
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>> @@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct 
>>>>> intel_context *ce)
>>>>>      return !!ce->parallel.number_children;
>>>>>  }
>>>>> -static inline bool intel_context_is_pinned(struct intel_context *ce);
>>>>> +static inline bool intel_context_is_pinned(const struct 
>>>>> intel_context *ce);
>>>>>  static inline struct intel_context *
>>>>>  intel_context_to_parent(struct intel_context *ce)
>>>>> @@ -116,7 +116,7 @@ static inline int 
>>>>> intel_context_lock_pinned(struct intel_context *ce)
>>>>>   * Returns: true if the context is currently pinned for use by the 
>>>>> GPU.
>>>>>   */
>>>>>  static inline bool
>>>>> -intel_context_is_pinned(struct intel_context *ce)
>>>>> +intel_context_is_pinned(const struct intel_context *ce)
>>>>>  {
>>>>>      return atomic_read(&ce->pin_count);
>>>>>  }
>>>>> @@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct 
>>>>> intel_context *ce)
>>>>>      clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>>>>  }
>>>>> -u64 intel_context_get_total_runtime_ns(const struct intel_context 
>>>>> *ce);
>>>>> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>>>>  u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>>>>  static inline u64 intel_context_clock(void)
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
>>>>> b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>> index 09f82545789f..797bb4242c18 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>> @@ -38,6 +38,9 @@ struct intel_context_ops {
>>>>>  #define COPS_RUNTIME_CYCLES_BIT 1
>>>>>  #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>>>> +#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>>>> +#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>>>> +
>>>>>      int (*alloc)(struct intel_context *ce);
>>>>>      void (*ban)(struct intel_context *ce, struct i915_request *rq);
>>>>> @@ -55,6 +58,8 @@ struct intel_context_ops {
>>>>>      void (*sched_disable)(struct intel_context *ce);
>>>>> +    void (*update_stats)(struct intel_context *ce);
>>>>> +
>>>>>      void (*reset)(struct intel_context *ce);
>>>>>      void (*destroy)(struct kref *kref);
>>>>> @@ -146,6 +151,7 @@ struct intel_context {
>>>>>              struct ewma_runtime avg;
>>>>>              u64 total;
>>>>>              u32 last;
>>>>> +            u64 start_gt_clk;
>>>>>              I915_SELFTEST_DECLARE(u32 num_underflow);
>>>>>              I915_SELFTEST_DECLARE(u32 max_underflow);
>>>>>          } runtime;
>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
>>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>> index b3c9a9327f76..6231ad03e4eb 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>> @@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 
>>>>> guc_class)
>>>>>      return guc_class_engine_class_map[guc_class];
>>>>>  }
>>>>> +/* Per context engine usage stats: */
>>>>> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO    (0x500 / sizeof(u32))
>>>>> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI 
>>>>> (PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>>>> +#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID 
>>>>> (PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>>>> +
>>>>>  /* Work item for submitting workloads into work queue of GuC. */
>>>>>  struct guc_wq_item {
>>>>>      u32 header;
>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>> index 5a1dfacf24ea..cbf3cbb983ce 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>> @@ -378,7 +378,7 @@ static inline void 
>>>>> set_context_guc_id_invalid(struct intel_context *ce)
>>>>>      ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>>>>  }
>>>>> -static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>>>>> +static inline struct intel_guc *ce_to_guc(const struct 
>>>>> intel_context *ce)
>>>>>  {
>>>>>      return &ce->engine->gt->uc.guc;
>>>>>  }
>>>>> @@ -1323,13 +1323,16 @@ static void 
>>>>> __update_guc_busyness_stats(struct intel_guc *guc)
>>>>>      spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>  }
>>>>> +static void __guc_context_update_clks(struct intel_context *ce);
>>>>>  static void guc_timestamp_ping(struct work_struct *wrk)
>>>>>  {
>>>>>      struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>>>>                           timestamp.work.work);
>>>>>      struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>>>>      struct intel_gt *gt = guc_to_gt(guc);
>>>>> +    struct intel_context *ce;
>>>>>      intel_wakeref_t wakeref;
>>>>> +    unsigned long index;
>>>>>      int srcu, ret;
>>>>>      /*
>>>>> @@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct 
>>>>> work_struct *wrk)
>>>>>      with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>>>>          __update_guc_busyness_stats(guc);
>>>>> +    /* adjust context stats for overflow */
>>>>> +    xa_for_each(&guc->context_lookup, index, ce)
>>>>> +        __guc_context_update_clks(ce);
>>>>> +
>>>>>      intel_gt_reset_unlock(gt, srcu);
>>>>>      mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>>>> @@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct 
>>>>> intel_gt *gt)
>>>>>               guc->timestamp.ping_delay);
>>>>>  }
>>>>> +static void __guc_context_update_clks(struct intel_context *ce)
>>>>> +{
>>>>> +    struct intel_guc *guc = ce_to_guc(ce);
>>>>> +    struct intel_gt *gt = ce->engine->gt;
>>>>> +    u32 *pphwsp, last_switch, engine_id;
>>>>> +    u64 start_gt_clk, active;
>>>>> +    unsigned long flags;
>>>>> +    ktime_t unused;
>>>>> +
>>>>> +    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>>>> +
>>>>> +    /*
>>>>> +     * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context 
>>>>> is switched
>>>>> +     * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>>>>> +     * relies on GuC and GPU for busyness calculations. Due to 
>>>>> this, A
>>>>> +     * potential race was highlighted in an earlier review that 
>>>>> can lead to
>>>>> +     * double accounting of busyness. While the solution to this 
>>>>> is a wip,
>>>>> +     * busyness is still usable for platforms running GuC submission.
>>>>> +     */
>>>>> +    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>>>> +    last_switch = 
>>>>> READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>>>> +    engine_id = 
>>>>> READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>>>> +
>>>>> +    guc_update_pm_timestamp(guc, &unused);
>>>>> +
>>>>> +    if (engine_id != 0xffffffff && last_switch) {
>>>>> +        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>>>> +        __extend_last_switch(guc, &start_gt_clk, last_switch);
>>>>> +        active = intel_gt_clock_interval_to_ns(gt, 
>>>>> guc->timestamp.gt_stamp - start_gt_clk);
>>>>> +        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>>>> +        WRITE_ONCE(ce->stats.active, active);
>>>>> +    } else {
>>>>> +        lrc_update_runtime(ce);
>>>>> +    }
>>>>> +
>>>>> +    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>> +}
>>>>> +
>>>>> +static void guc_context_update_stats(struct intel_context *ce)
>>>>> +{
>>>>> +    if (!intel_context_pin_if_active(ce)) {
>>>>> +        WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>>>> +        WRITE_ONCE(ce->stats.active, 0);
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    __guc_context_update_clks(ce);
>>>>> +    intel_context_unpin(ce);
>>>>> +}
>>>>> +
>>>>>  static inline bool
>>>>>  submission_disabled(struct intel_guc *guc)
>>>>>  {
>>>>> @@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct 
>>>>> intel_context *ce)
>>>>>  {
>>>>>      struct intel_guc *guc = ce_to_guc(ce);
>>>>> +    lrc_update_runtime(ce);
>>>>>      unpin_guc_id(guc, ce);
>>>>>      lrc_unpin(ce);
>>>>> @@ -3183,6 +3241,7 @@ static void remove_from_context(struct 
>>>>> i915_request *rq)
>>>>>  }
>>>>>  static const struct intel_context_ops guc_context_ops = {
>>>>> +    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>      .alloc = guc_context_alloc,
>>>>>      .pre_pin = guc_context_pre_pin,
>>>>> @@ -3199,6 +3258,8 @@ static const struct intel_context_ops 
>>>>> guc_context_ops = {
>>>>>      .sched_disable = guc_context_sched_disable,
>>>>> +    .update_stats = guc_context_update_stats,
>>>>> +
>>>>>      .reset = lrc_reset,
>>>>>      .destroy = guc_context_destroy,
>>>>> @@ -3432,6 +3493,7 @@ static int guc_virtual_context_alloc(struct 
>>>>> intel_context *ce)
>>>>>  }
>>>>>  static const struct intel_context_ops virtual_guc_context_ops = {
>>>>> +    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>      .alloc = guc_virtual_context_alloc,
>>>>>      .pre_pin = guc_virtual_context_pre_pin,
>>>>> @@ -3447,6 +3509,7 @@ static const struct intel_context_ops 
>>>>> virtual_guc_context_ops = {
>>>>>      .exit = guc_virtual_context_exit,
>>>>>      .sched_disable = guc_context_sched_disable,
>>>>> +    .update_stats = guc_context_update_stats,
>>>>
>>>> There are also virtual_parent_context_ops and 
>>>> virtual_child_context_ops - which means more test coverage is needed..
>>>
>>> Trying to come back to this... The 
>>> virtual_parent_context_ops/virtual_child_context_ops are used for 
>>> parallel engines. GuC would only update the pphwsp of the parent 
>>> context with the last_switched_in_time.
>>>
>>> In general, how should I report the busyness for a parallel engine?
>>>
>>> I would think it is busyness reported by parent context multiplied by 
>>> width.
>>
>> That could a reasonable approximation but I can't say for certain. 
>> Depends on the GuC scheduler implementation a bit. Like is anything 
>> preventing child contexts from finishing their useful work ahead of 
>> the parent context, or they are always strictly scheduled as one 
>> entity and child engines are blocked from taking other workloads until 
>> the parent is scheduled out?
> 
> Correct, if a child finishes the work before parent/siblings for some 
> reason, it cannot take up other work until all siblings are done.

The only problem is that I guess one day that assumption might break and 
we will "never" now. If you have some spare time it would be best to add 
an IGT to verify this assumption, or at least put that work as TODO in 
the backlog?

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-02  8:41         ` Tvrtko Ursulin
@ 2022-08-02 23:38           ` Umesh Nerlige Ramappa
  2022-08-04  1:21             ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-08-02 23:38 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Tue, Aug 02, 2022 at 09:41:38AM +0100, Tvrtko Ursulin wrote:
>
>On 01/08/2022 20:02, Umesh Nerlige Ramappa wrote:
>>On Wed, Jul 27, 2022 at 09:48:18AM +0100, Tvrtko Ursulin wrote:
>>>
>>>On 27/07/2022 07:01, Umesh Nerlige Ramappa wrote:
>>>>On Fri, Jun 17, 2022 at 09:00:06AM +0100, Tvrtko Ursulin wrote:
>>>>>
>>>>>On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
>>>>>>From: John Harrison <John.C.Harrison@Intel.com>
>>>>>>
>>>>>>GuC provides engine_id and last_switch_in ticks for an 
>>>>>>active context in
>>>>>>the pphwsp. The context image provides a 32 bit total ticks 
>>>>>>which is the
>>>>>>accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>>>>>information is used to calculate the context busyness as follows:
>>>>>>
>>>>>>If the engine_id is valid, then busyness is the sum of 
>>>>>>accumulated total
>>>>>>ticks and active ticks. Active ticks is calculated with 
>>>>>>current gt time
>>>>>>as reference.
>>>>>>
>>>>>>If engine_id is invalid, busyness is equal to accumulated total ticks.
>>>>>>
>>>>>>Since KMD (CPU) retrieves busyness data from 2 sources - GPU 
>>>>>>and GuC, a
>>>>>>potential race was highlighted in an earlier review that can lead to
>>>>>>double accounting of busyness. While the solution to this is a wip,
>>>>>>busyness is still usable for platforms running GuC submission.
>>>>>>
>>>>>>v2: (Tvrtko)
>>>>>>- Use COPS_RUNTIME_ACTIVE_TOTAL
>>>>>>- Add code comment for the race
>>>>>>- Undo local variables initializations
>>>>>>
>>>>>>v3:
>>>>>>- Add support for virtual engines based on
>>>>>>  https://patchwork.freedesktop.org/series/105227/
>>>>>>
>>>>>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>>>>---
>>>>>> drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>>>>>> drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>>>>> drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>>>>>> drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>>>>> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 
>>>>>>++++++++++++++++++-
>>>>>> drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>>>>> 6 files changed, 89 insertions(+), 11 deletions(-)
>>>>>>
>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>>>>>b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>index 4070cb5711d8..4a84146710e0 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>@@ -576,16 +576,24 @@ void 
>>>>>>intel_context_bind_parent_child(struct intel_context 
>>>>>>*parent,
>>>>>>     child->parallel.parent = parent;
>>>>>> }
>>>>>>-u64 intel_context_get_total_runtime_ns(const struct 
>>>>>>intel_context *ce)
>>>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>>>> {
>>>>>>     u64 total, active;
>>>>>>+    if (ce->ops->update_stats)
>>>>>>+        ce->ops->update_stats(ce);
>>>>>>+
>>>>>>     total = ce->stats.runtime.total;
>>>>>>     if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>>>>         total *= ce->engine->gt->clock_period_ns;
>>>>>>     active = READ_ONCE(ce->stats.active);
>>>>>>-    if (active)
>>>>>>+    /*
>>>>>>+     * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, 
>>>>>>the backend
>>>>>>+     * already provides the total active time of the 
>>>>>>context, so skip this
>>>>>>+     * calculation when this flag is set.
>>>>>>+     */
>>>>>>+    if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>>>>>         active = intel_context_clock() - active;
>>>>>>     return total + active;
>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>>>>>b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>index b7d3214d2cdd..5fc7c19ab29b 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>@@ -56,7 +56,7 @@ static inline bool 
>>>>>>intel_context_is_parent(struct intel_context *ce)
>>>>>>     return !!ce->parallel.number_children;
>>>>>> }
>>>>>>-static inline bool intel_context_is_pinned(struct intel_context *ce);
>>>>>>+static inline bool intel_context_is_pinned(const struct 
>>>>>>intel_context *ce);
>>>>>> static inline struct intel_context *
>>>>>> intel_context_to_parent(struct intel_context *ce)
>>>>>>@@ -116,7 +116,7 @@ static inline int 
>>>>>>intel_context_lock_pinned(struct intel_context *ce)
>>>>>>  * Returns: true if the context is currently pinned for use 
>>>>>>by the GPU.
>>>>>>  */
>>>>>> static inline bool
>>>>>>-intel_context_is_pinned(struct intel_context *ce)
>>>>>>+intel_context_is_pinned(const struct intel_context *ce)
>>>>>> {
>>>>>>     return atomic_read(&ce->pin_count);
>>>>>> }
>>>>>>@@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct 
>>>>>>intel_context *ce)
>>>>>>     clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>>>>> }
>>>>>>-u64 intel_context_get_total_runtime_ns(const struct 
>>>>>>intel_context *ce);
>>>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>>>>> u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>>>>> static inline u64 intel_context_clock(void)
>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
>>>>>>b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>index 09f82545789f..797bb4242c18 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>@@ -38,6 +38,9 @@ struct intel_context_ops {
>>>>>> #define COPS_RUNTIME_CYCLES_BIT 1
>>>>>> #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>>>>>+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>>>>>+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>>>>>+
>>>>>>     int (*alloc)(struct intel_context *ce);
>>>>>>     void (*ban)(struct intel_context *ce, struct i915_request *rq);
>>>>>>@@ -55,6 +58,8 @@ struct intel_context_ops {
>>>>>>     void (*sched_disable)(struct intel_context *ce);
>>>>>>+    void (*update_stats)(struct intel_context *ce);
>>>>>>+
>>>>>>     void (*reset)(struct intel_context *ce);
>>>>>>     void (*destroy)(struct kref *kref);
>>>>>>@@ -146,6 +151,7 @@ struct intel_context {
>>>>>>             struct ewma_runtime avg;
>>>>>>             u64 total;
>>>>>>             u32 last;
>>>>>>+            u64 start_gt_clk;
>>>>>>             I915_SELFTEST_DECLARE(u32 num_underflow);
>>>>>>             I915_SELFTEST_DECLARE(u32 max_underflow);
>>>>>>         } runtime;
>>>>>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
>>>>>>b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>index b3c9a9327f76..6231ad03e4eb 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>@@ -196,6 +196,11 @@ static inline u8 
>>>>>>guc_class_to_engine_class(u8 guc_class)
>>>>>>     return guc_class_engine_class_map[guc_class];
>>>>>> }
>>>>>>+/* Per context engine usage stats: */
>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO    (0x500 / sizeof(u32))
>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI 
>>>>>>(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID 
>>>>>>(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>>>>>+
>>>>>> /* Work item for submitting workloads into work queue of GuC. */
>>>>>> struct guc_wq_item {
>>>>>>     u32 header;
>>>>>>diff --git 
>>>>>>a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>>>>b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>index 5a1dfacf24ea..cbf3cbb983ce 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>@@ -378,7 +378,7 @@ static inline void 
>>>>>>set_context_guc_id_invalid(struct intel_context *ce)
>>>>>>     ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>>>>> }
>>>>>>-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>>>>>>+static inline struct intel_guc *ce_to_guc(const struct 
>>>>>>intel_context *ce)
>>>>>> {
>>>>>>     return &ce->engine->gt->uc.guc;
>>>>>> }
>>>>>>@@ -1323,13 +1323,16 @@ static void 
>>>>>>__update_guc_busyness_stats(struct intel_guc *guc)
>>>>>>     spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>> }
>>>>>>+static void __guc_context_update_clks(struct intel_context *ce);
>>>>>> static void guc_timestamp_ping(struct work_struct *wrk)
>>>>>> {
>>>>>>     struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>>>>>                          timestamp.work.work);
>>>>>>     struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>>>>>     struct intel_gt *gt = guc_to_gt(guc);
>>>>>>+    struct intel_context *ce;
>>>>>>     intel_wakeref_t wakeref;
>>>>>>+    unsigned long index;
>>>>>>     int srcu, ret;
>>>>>>     /*
>>>>>>@@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct 
>>>>>>work_struct *wrk)
>>>>>>     with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>>>>>         __update_guc_busyness_stats(guc);
>>>>>>+    /* adjust context stats for overflow */
>>>>>>+    xa_for_each(&guc->context_lookup, index, ce)
>>>>>>+        __guc_context_update_clks(ce);
>>>>>>+
>>>>>>     intel_gt_reset_unlock(gt, srcu);
>>>>>>     mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>>>>>@@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct 
>>>>>>intel_gt *gt)
>>>>>>              guc->timestamp.ping_delay);
>>>>>> }
>>>>>>+static void __guc_context_update_clks(struct intel_context *ce)
>>>>>>+{
>>>>>>+    struct intel_guc *guc = ce_to_guc(ce);
>>>>>>+    struct intel_gt *gt = ce->engine->gt;
>>>>>>+    u32 *pphwsp, last_switch, engine_id;
>>>>>>+    u64 start_gt_clk, active;
>>>>>>+    unsigned long flags;
>>>>>>+    ktime_t unused;
>>>>>>+
>>>>>>+    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>>>>>+
>>>>>>+    /*
>>>>>>+     * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when 
>>>>>>context is switched
>>>>>>+     * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>>>>>>+     * relies on GuC and GPU for busyness calculations. Due 
>>>>>>to this, A
>>>>>>+     * potential race was highlighted in an earlier review 
>>>>>>that can lead to
>>>>>>+     * double accounting of busyness. While the solution to 
>>>>>>this is a wip,
>>>>>>+     * busyness is still usable for platforms running GuC submission.
>>>>>>+     */
>>>>>>+    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>>>>>+    last_switch = 
>>>>>>READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>>>>>+    engine_id = 
>>>>>>READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>>>>>+
>>>>>>+    guc_update_pm_timestamp(guc, &unused);
>>>>>>+
>>>>>>+    if (engine_id != 0xffffffff && last_switch) {
>>>>>>+        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>>>>>+        __extend_last_switch(guc, &start_gt_clk, last_switch);
>>>>>>+        active = intel_gt_clock_interval_to_ns(gt, 
>>>>>>guc->timestamp.gt_stamp - start_gt_clk);
>>>>>>+        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>>>>>+        WRITE_ONCE(ce->stats.active, active);
>>>>>>+    } else {
>>>>>>+        lrc_update_runtime(ce);
>>>>>>+    }
>>>>>>+
>>>>>>+    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>>+}
>>>>>>+
>>>>>>+static void guc_context_update_stats(struct intel_context *ce)
>>>>>>+{
>>>>>>+    if (!intel_context_pin_if_active(ce)) {
>>>>>>+        WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>>>>>+        WRITE_ONCE(ce->stats.active, 0);
>>>>>>+        return;
>>>>>>+    }
>>>>>>+
>>>>>>+    __guc_context_update_clks(ce);
>>>>>>+    intel_context_unpin(ce);
>>>>>>+}
>>>>>>+
>>>>>> static inline bool
>>>>>> submission_disabled(struct intel_guc *guc)
>>>>>> {
>>>>>>@@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct 
>>>>>>intel_context *ce)
>>>>>> {
>>>>>>     struct intel_guc *guc = ce_to_guc(ce);
>>>>>>+    lrc_update_runtime(ce);
>>>>>>     unpin_guc_id(guc, ce);
>>>>>>     lrc_unpin(ce);
>>>>>>@@ -3183,6 +3241,7 @@ static void remove_from_context(struct 
>>>>>>i915_request *rq)
>>>>>> }
>>>>>> static const struct intel_context_ops guc_context_ops = {
>>>>>>+    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>>     .alloc = guc_context_alloc,
>>>>>>     .pre_pin = guc_context_pre_pin,
>>>>>>@@ -3199,6 +3258,8 @@ static const struct intel_context_ops 
>>>>>>guc_context_ops = {
>>>>>>     .sched_disable = guc_context_sched_disable,
>>>>>>+    .update_stats = guc_context_update_stats,
>>>>>>+
>>>>>>     .reset = lrc_reset,
>>>>>>     .destroy = guc_context_destroy,
>>>>>>@@ -3432,6 +3493,7 @@ static int 
>>>>>>guc_virtual_context_alloc(struct intel_context *ce)
>>>>>> }
>>>>>> static const struct intel_context_ops virtual_guc_context_ops = {
>>>>>>+    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>>     .alloc = guc_virtual_context_alloc,
>>>>>>     .pre_pin = guc_virtual_context_pre_pin,
>>>>>>@@ -3447,6 +3509,7 @@ static const struct intel_context_ops 
>>>>>>virtual_guc_context_ops = {
>>>>>>     .exit = guc_virtual_context_exit,
>>>>>>     .sched_disable = guc_context_sched_disable,
>>>>>>+    .update_stats = guc_context_update_stats,
>>>>>
>>>>>There are also virtual_parent_context_ops and 
>>>>>virtual_child_context_ops - which means more test coverage is 
>>>>>needed..
>>>>
>>>>Trying to come back to this... The 
>>>>virtual_parent_context_ops/virtual_child_context_ops are used 
>>>>for parallel engines. GuC would only update the pphwsp of the 
>>>>parent context with the last_switched_in_time.
>>>>
>>>>In general, how should I report the busyness for a parallel engine?
>>>>
>>>>I would think it is busyness reported by parent context 
>>>>multiplied by width.
>>>
>>>That could a reasonable approximation but I can't say for certain. 
>>>Depends on the GuC scheduler implementation a bit. Like is 
>>>anything preventing child contexts from finishing their useful 
>>>work ahead of the parent context, or they are always strictly 
>>>scheduled as one entity and child engines are blocked from taking 
>>>other workloads until the parent is scheduled out?
>>
>>Correct, if a child finishes the work before parent/siblings for 
>>some reason, it cannot take up other work until all siblings are 
>>done.
>
>The only problem is that I guess one day that assumption might break 
>and we will "never" now. If you have some spare time it would be best 
>to add an IGT to verify this assumption, or at least put that work as 
>TODO in the backlog?

I added some tests to IGT for parallel engine, but something is missing 
in the way I am submitting the batches to the parallel engine. I see 
some hangs, haven't had a chance to debug that. Will try to get to it 
and then post the updated i915 patches.

Thanks,
Umesh

>
>Regards,
>
>Tvrtko

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-02 23:38           ` Umesh Nerlige Ramappa
@ 2022-08-04  1:21             ` Umesh Nerlige Ramappa
  2022-08-04  7:25               ` Tvrtko Ursulin
  0 siblings, 1 reply; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-08-04  1:21 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Tue, Aug 02, 2022 at 04:38:45PM -0700, Umesh Nerlige Ramappa wrote:
>On Tue, Aug 02, 2022 at 09:41:38AM +0100, Tvrtko Ursulin wrote:
>>
>>On 01/08/2022 20:02, Umesh Nerlige Ramappa wrote:
>>>On Wed, Jul 27, 2022 at 09:48:18AM +0100, Tvrtko Ursulin wrote:
>>>>
>>>>On 27/07/2022 07:01, Umesh Nerlige Ramappa wrote:
>>>>>On Fri, Jun 17, 2022 at 09:00:06AM +0100, Tvrtko Ursulin wrote:
>>>>>>
>>>>>>On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
>>>>>>>From: John Harrison <John.C.Harrison@Intel.com>
>>>>>>>
>>>>>>>GuC provides engine_id and last_switch_in ticks for an 
>>>>>>>active context in
>>>>>>>the pphwsp. The context image provides a 32 bit total 
>>>>>>>ticks which is the
>>>>>>>accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>>>>>>information is used to calculate the context busyness as follows:
>>>>>>>
>>>>>>>If the engine_id is valid, then busyness is the sum of 
>>>>>>>accumulated total
>>>>>>>ticks and active ticks. Active ticks is calculated with 
>>>>>>>current gt time
>>>>>>>as reference.
>>>>>>>
>>>>>>>If engine_id is invalid, busyness is equal to accumulated total ticks.
>>>>>>>
>>>>>>>Since KMD (CPU) retrieves busyness data from 2 sources - 
>>>>>>>GPU and GuC, a
>>>>>>>potential race was highlighted in an earlier review that can lead to
>>>>>>>double accounting of busyness. While the solution to this is a wip,
>>>>>>>busyness is still usable for platforms running GuC submission.
>>>>>>>
>>>>>>>v2: (Tvrtko)
>>>>>>>- Use COPS_RUNTIME_ACTIVE_TOTAL
>>>>>>>- Add code comment for the race
>>>>>>>- Undo local variables initializations
>>>>>>>
>>>>>>>v3:
>>>>>>>- Add support for virtual engines based on
>>>>>>>  https://patchwork.freedesktop.org/series/105227/
>>>>>>>
>>>>>>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>>>>>---
>>>>>>> drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>>>>>>> drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>>>>>> drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>>>>>>> drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>>>>>> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 
>>>>>>>++++++++++++++++++-
>>>>>>> drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>>>>>> 6 files changed, 89 insertions(+), 11 deletions(-)
>>>>>>>
>>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>>>>>>b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>>index 4070cb5711d8..4a84146710e0 100644
>>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>>@@ -576,16 +576,24 @@ void 
>>>>>>>intel_context_bind_parent_child(struct intel_context 
>>>>>>>*parent,
>>>>>>>     child->parallel.parent = parent;
>>>>>>> }
>>>>>>>-u64 intel_context_get_total_runtime_ns(const struct 
>>>>>>>intel_context *ce)
>>>>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>>>>> {
>>>>>>>     u64 total, active;
>>>>>>>+    if (ce->ops->update_stats)
>>>>>>>+        ce->ops->update_stats(ce);
>>>>>>>+
>>>>>>>     total = ce->stats.runtime.total;
>>>>>>>     if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>>>>>         total *= ce->engine->gt->clock_period_ns;
>>>>>>>     active = READ_ONCE(ce->stats.active);
>>>>>>>-    if (active)
>>>>>>>+    /*
>>>>>>>+     * When COPS_RUNTIME_ACTIVE_TOTAL is set for 
>>>>>>>ce->cops, the backend
>>>>>>>+     * already provides the total active time of the 
>>>>>>>context, so skip this
>>>>>>>+     * calculation when this flag is set.
>>>>>>>+     */
>>>>>>>+    if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>>>>>>         active = intel_context_clock() - active;
>>>>>>>     return total + active;
>>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>>>>>>b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>>index b7d3214d2cdd..5fc7c19ab29b 100644
>>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>>@@ -56,7 +56,7 @@ static inline bool 
>>>>>>>intel_context_is_parent(struct intel_context *ce)
>>>>>>>     return !!ce->parallel.number_children;
>>>>>>> }
>>>>>>>-static inline bool intel_context_is_pinned(struct intel_context *ce);
>>>>>>>+static inline bool intel_context_is_pinned(const struct 
>>>>>>>intel_context *ce);
>>>>>>> static inline struct intel_context *
>>>>>>> intel_context_to_parent(struct intel_context *ce)
>>>>>>>@@ -116,7 +116,7 @@ static inline int 
>>>>>>>intel_context_lock_pinned(struct intel_context *ce)
>>>>>>>  * Returns: true if the context is currently pinned for 
>>>>>>>use by the GPU.
>>>>>>>  */
>>>>>>> static inline bool
>>>>>>>-intel_context_is_pinned(struct intel_context *ce)
>>>>>>>+intel_context_is_pinned(const struct intel_context *ce)
>>>>>>> {
>>>>>>>     return atomic_read(&ce->pin_count);
>>>>>>> }
>>>>>>>@@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct 
>>>>>>>intel_context *ce)
>>>>>>>     clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>>>>>> }
>>>>>>>-u64 intel_context_get_total_runtime_ns(const struct 
>>>>>>>intel_context *ce);
>>>>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>>>>>> u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>>>>>> static inline u64 intel_context_clock(void)
>>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
>>>>>>>b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>>index 09f82545789f..797bb4242c18 100644
>>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>>@@ -38,6 +38,9 @@ struct intel_context_ops {
>>>>>>> #define COPS_RUNTIME_CYCLES_BIT 1
>>>>>>> #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>>>>>>+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>>>>>>+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>>>>>>+
>>>>>>>     int (*alloc)(struct intel_context *ce);
>>>>>>>     void (*ban)(struct intel_context *ce, struct i915_request *rq);
>>>>>>>@@ -55,6 +58,8 @@ struct intel_context_ops {
>>>>>>>     void (*sched_disable)(struct intel_context *ce);
>>>>>>>+    void (*update_stats)(struct intel_context *ce);
>>>>>>>+
>>>>>>>     void (*reset)(struct intel_context *ce);
>>>>>>>     void (*destroy)(struct kref *kref);
>>>>>>>@@ -146,6 +151,7 @@ struct intel_context {
>>>>>>>             struct ewma_runtime avg;
>>>>>>>             u64 total;
>>>>>>>             u32 last;
>>>>>>>+            u64 start_gt_clk;
>>>>>>>             I915_SELFTEST_DECLARE(u32 num_underflow);
>>>>>>>             I915_SELFTEST_DECLARE(u32 max_underflow);
>>>>>>>         } runtime;
>>>>>>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
>>>>>>>b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>>index b3c9a9327f76..6231ad03e4eb 100644
>>>>>>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>>@@ -196,6 +196,11 @@ static inline u8 
>>>>>>>guc_class_to_engine_class(u8 guc_class)
>>>>>>>     return guc_class_engine_class_map[guc_class];
>>>>>>> }
>>>>>>>+/* Per context engine usage stats: */
>>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO    (0x500 / sizeof(u32))
>>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI 
>>>>>>>(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID 
>>>>>>>(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>>>>>>+
>>>>>>> /* Work item for submitting workloads into work queue of GuC. */
>>>>>>> struct guc_wq_item {
>>>>>>>     u32 header;
>>>>>>>diff --git 
>>>>>>>a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>>>>>b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>>index 5a1dfacf24ea..cbf3cbb983ce 100644
>>>>>>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>>@@ -378,7 +378,7 @@ static inline void 
>>>>>>>set_context_guc_id_invalid(struct intel_context *ce)
>>>>>>>     ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>>>>>> }
>>>>>>>-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>>>>>>>+static inline struct intel_guc *ce_to_guc(const struct 
>>>>>>>intel_context *ce)
>>>>>>> {
>>>>>>>     return &ce->engine->gt->uc.guc;
>>>>>>> }
>>>>>>>@@ -1323,13 +1323,16 @@ static void 
>>>>>>>__update_guc_busyness_stats(struct intel_guc *guc)
>>>>>>>     spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>>> }
>>>>>>>+static void __guc_context_update_clks(struct intel_context *ce);
>>>>>>> static void guc_timestamp_ping(struct work_struct *wrk)
>>>>>>> {
>>>>>>>     struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>>>>>>                          timestamp.work.work);
>>>>>>>     struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>>>>>>     struct intel_gt *gt = guc_to_gt(guc);
>>>>>>>+    struct intel_context *ce;
>>>>>>>     intel_wakeref_t wakeref;
>>>>>>>+    unsigned long index;
>>>>>>>     int srcu, ret;
>>>>>>>     /*
>>>>>>>@@ -1343,6 +1346,10 @@ static void 
>>>>>>>guc_timestamp_ping(struct work_struct *wrk)
>>>>>>>     with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>>>>>>         __update_guc_busyness_stats(guc);
>>>>>>>+    /* adjust context stats for overflow */
>>>>>>>+    xa_for_each(&guc->context_lookup, index, ce)
>>>>>>>+        __guc_context_update_clks(ce);
>>>>>>>+
>>>>>>>     intel_gt_reset_unlock(gt, srcu);
>>>>>>>     mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>>>>>>@@ -1405,6 +1412,56 @@ void 
>>>>>>>intel_guc_busyness_unpark(struct intel_gt *gt)
>>>>>>>              guc->timestamp.ping_delay);
>>>>>>> }
>>>>>>>+static void __guc_context_update_clks(struct intel_context *ce)
>>>>>>>+{
>>>>>>>+    struct intel_guc *guc = ce_to_guc(ce);
>>>>>>>+    struct intel_gt *gt = ce->engine->gt;
>>>>>>>+    u32 *pphwsp, last_switch, engine_id;
>>>>>>>+    u64 start_gt_clk, active;
>>>>>>>+    unsigned long flags;
>>>>>>>+    ktime_t unused;
>>>>>>>+
>>>>>>>+    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>>>>>>+
>>>>>>>+    /*
>>>>>>>+     * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when 
>>>>>>>context is switched
>>>>>>>+     * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>>>>>>>+     * relies on GuC and GPU for busyness calculations. 
>>>>>>>Due to this, A
>>>>>>>+     * potential race was highlighted in an earlier 
>>>>>>>review that can lead to
>>>>>>>+     * double accounting of busyness. While the solution 
>>>>>>>to this is a wip,
>>>>>>>+     * busyness is still usable for platforms running GuC submission.
>>>>>>>+     */
>>>>>>>+    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>>>>>>+    last_switch = 
>>>>>>>READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>>>>>>+    engine_id = 
>>>>>>>READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>>>>>>+
>>>>>>>+    guc_update_pm_timestamp(guc, &unused);
>>>>>>>+
>>>>>>>+    if (engine_id != 0xffffffff && last_switch) {
>>>>>>>+        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>>>>>>+        __extend_last_switch(guc, &start_gt_clk, last_switch);
>>>>>>>+        active = intel_gt_clock_interval_to_ns(gt, 
>>>>>>>guc->timestamp.gt_stamp - start_gt_clk);
>>>>>>>+        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>>>>>>+        WRITE_ONCE(ce->stats.active, active);
>>>>>>>+    } else {
>>>>>>>+        lrc_update_runtime(ce);
>>>>>>>+    }
>>>>>>>+
>>>>>>>+    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>>>+}
>>>>>>>+
>>>>>>>+static void guc_context_update_stats(struct intel_context *ce)
>>>>>>>+{
>>>>>>>+    if (!intel_context_pin_if_active(ce)) {
>>>>>>>+        WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>>>>>>+        WRITE_ONCE(ce->stats.active, 0);
>>>>>>>+        return;
>>>>>>>+    }
>>>>>>>+
>>>>>>>+    __guc_context_update_clks(ce);
>>>>>>>+    intel_context_unpin(ce);
>>>>>>>+}
>>>>>>>+
>>>>>>> static inline bool
>>>>>>> submission_disabled(struct intel_guc *guc)
>>>>>>> {
>>>>>>>@@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct 
>>>>>>>intel_context *ce)
>>>>>>> {
>>>>>>>     struct intel_guc *guc = ce_to_guc(ce);
>>>>>>>+    lrc_update_runtime(ce);
>>>>>>>     unpin_guc_id(guc, ce);
>>>>>>>     lrc_unpin(ce);
>>>>>>>@@ -3183,6 +3241,7 @@ static void 
>>>>>>>remove_from_context(struct i915_request *rq)
>>>>>>> }
>>>>>>> static const struct intel_context_ops guc_context_ops = {
>>>>>>>+    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>>>     .alloc = guc_context_alloc,
>>>>>>>     .pre_pin = guc_context_pre_pin,
>>>>>>>@@ -3199,6 +3258,8 @@ static const struct 
>>>>>>>intel_context_ops guc_context_ops = {
>>>>>>>     .sched_disable = guc_context_sched_disable,
>>>>>>>+    .update_stats = guc_context_update_stats,
>>>>>>>+
>>>>>>>     .reset = lrc_reset,
>>>>>>>     .destroy = guc_context_destroy,
>>>>>>>@@ -3432,6 +3493,7 @@ static int 
>>>>>>>guc_virtual_context_alloc(struct intel_context *ce)
>>>>>>> }
>>>>>>> static const struct intel_context_ops virtual_guc_context_ops = {
>>>>>>>+    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>>>     .alloc = guc_virtual_context_alloc,
>>>>>>>     .pre_pin = guc_virtual_context_pre_pin,
>>>>>>>@@ -3447,6 +3509,7 @@ static const struct 
>>>>>>>intel_context_ops virtual_guc_context_ops = {
>>>>>>>     .exit = guc_virtual_context_exit,
>>>>>>>     .sched_disable = guc_context_sched_disable,
>>>>>>>+    .update_stats = guc_context_update_stats,
>>>>>>
>>>>>>There are also virtual_parent_context_ops and 
>>>>>>virtual_child_context_ops - which means more test coverage 
>>>>>>is needed..
>>>>>
>>>>>Trying to come back to this... The 
>>>>>virtual_parent_context_ops/virtual_child_context_ops are used 
>>>>>for parallel engines. GuC would only update the pphwsp of the 
>>>>>parent context with the last_switched_in_time.
>>>>>
>>>>>In general, how should I report the busyness for a parallel engine?
>>>>>
>>>>>I would think it is busyness reported by parent context 
>>>>>multiplied by width.
>>>>
>>>>That could a reasonable approximation but I can't say for 
>>>>certain. Depends on the GuC scheduler implementation a bit. Like 
>>>>is anything preventing child contexts from finishing their 
>>>>useful work ahead of the parent context, or they are always 
>>>>strictly scheduled as one entity and child engines are blocked 
>>>>from taking other workloads until the parent is scheduled out?
>>>
>>>Correct, if a child finishes the work before parent/siblings for 
>>>some reason, it cannot take up other work until all siblings are 
>>>done.
>>
>>The only problem is that I guess one day that assumption might break 
>>and we will "never" now. If you have some spare time it would be 
>>best to add an IGT to verify this assumption, or at least put that 
>>work as TODO in the backlog?
>
>I added some tests to IGT for parallel engine, but something is 
>missing in the way I am submitting the batches to the parallel engine. 
>I see some hangs, haven't had a chance to debug that. Will try to get 
>to it and then post the updated i915 patches.

I think I may have to do the parallel engine testing later. Do you think 
this patch alone is good enough for now? It does not enable context 
busyness for parallel execution (which is just adding this 
COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL in the parent/child 
context ops)

If so, okay to post a rebased version?

Thanks,
Umesh
>
>Thanks,
>Umesh
>
>>
>>Regards,
>>
>>Tvrtko

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-04  1:21             ` Umesh Nerlige Ramappa
@ 2022-08-04  7:25               ` Tvrtko Ursulin
  0 siblings, 0 replies; 29+ messages in thread
From: Tvrtko Ursulin @ 2022-08-04  7:25 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx


On 04/08/2022 02:21, Umesh Nerlige Ramappa wrote:
> On Tue, Aug 02, 2022 at 04:38:45PM -0700, Umesh Nerlige Ramappa wrote:
>> On Tue, Aug 02, 2022 at 09:41:38AM +0100, Tvrtko Ursulin wrote:
>>>
>>> On 01/08/2022 20:02, Umesh Nerlige Ramappa wrote:
>>>> On Wed, Jul 27, 2022 at 09:48:18AM +0100, Tvrtko Ursulin wrote:
>>>>>
>>>>> On 27/07/2022 07:01, Umesh Nerlige Ramappa wrote:
>>>>>> On Fri, Jun 17, 2022 at 09:00:06AM +0100, Tvrtko Ursulin wrote:
>>>>>>>
>>>>>>> On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
>>>>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>>>>
>>>>>>>> GuC provides engine_id and last_switch_in ticks for an active 
>>>>>>>> context in
>>>>>>>> the pphwsp. The context image provides a 32 bit total ticks 
>>>>>>>> which is the
>>>>>>>> accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>>>>>>> information is used to calculate the context busyness as follows:
>>>>>>>>
>>>>>>>> If the engine_id is valid, then busyness is the sum of 
>>>>>>>> accumulated total
>>>>>>>> ticks and active ticks. Active ticks is calculated with current 
>>>>>>>> gt time
>>>>>>>> as reference.
>>>>>>>>
>>>>>>>> If engine_id is invalid, busyness is equal to accumulated total 
>>>>>>>> ticks.
>>>>>>>>
>>>>>>>> Since KMD (CPU) retrieves busyness data from 2 sources - GPU and 
>>>>>>>> GuC, a
>>>>>>>> potential race was highlighted in an earlier review that can 
>>>>>>>> lead to
>>>>>>>> double accounting of busyness. While the solution to this is a wip,
>>>>>>>> busyness is still usable for platforms running GuC submission.
>>>>>>>>
>>>>>>>> v2: (Tvrtko)
>>>>>>>> - Use COPS_RUNTIME_ACTIVE_TOTAL
>>>>>>>> - Add code comment for the race
>>>>>>>> - Undo local variables initializations
>>>>>>>>
>>>>>>>> v3:
>>>>>>>> - Add support for virtual engines based on
>>>>>>>>   https://patchwork.freedesktop.org/series/105227/
>>>>>>>>
>>>>>>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>>>>> Signed-off-by: Umesh Nerlige Ramappa 
>>>>>>>> <umesh.nerlige.ramappa@intel.com>
>>>>>>>> ---
>>>>>>>>  drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>>>>>>>>  drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>>>>>>>  drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>>>>>>>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>>>>>>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 
>>>>>>>> ++++++++++++++++++-
>>>>>>>>  drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>>>>>>>  6 files changed, 89 insertions(+), 11 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>>>>>>> b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>>> index 4070cb5711d8..4a84146710e0 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>>> @@ -576,16 +576,24 @@ void 
>>>>>>>> intel_context_bind_parent_child(struct intel_context *parent,
>>>>>>>>      child->parallel.parent = parent;
>>>>>>>>  }
>>>>>>>> -u64 intel_context_get_total_runtime_ns(const struct 
>>>>>>>> intel_context *ce)
>>>>>>>> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>>>>>>  {
>>>>>>>>      u64 total, active;
>>>>>>>> +    if (ce->ops->update_stats)
>>>>>>>> +        ce->ops->update_stats(ce);
>>>>>>>> +
>>>>>>>>      total = ce->stats.runtime.total;
>>>>>>>>      if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>>>>>>          total *= ce->engine->gt->clock_period_ns;
>>>>>>>>      active = READ_ONCE(ce->stats.active);
>>>>>>>> -    if (active)
>>>>>>>> +    /*
>>>>>>>> +     * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the 
>>>>>>>> backend
>>>>>>>> +     * already provides the total active time of the context, 
>>>>>>>> so skip this
>>>>>>>> +     * calculation when this flag is set.
>>>>>>>> +     */
>>>>>>>> +    if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>>>>>>>          active = intel_context_clock() - active;
>>>>>>>>      return total + active;
>>>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>>>>>>> b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>>> index b7d3214d2cdd..5fc7c19ab29b 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>>> @@ -56,7 +56,7 @@ static inline bool 
>>>>>>>> intel_context_is_parent(struct intel_context *ce)
>>>>>>>>      return !!ce->parallel.number_children;
>>>>>>>>  }
>>>>>>>> -static inline bool intel_context_is_pinned(struct intel_context 
>>>>>>>> *ce);
>>>>>>>> +static inline bool intel_context_is_pinned(const struct 
>>>>>>>> intel_context *ce);
>>>>>>>>  static inline struct intel_context *
>>>>>>>>  intel_context_to_parent(struct intel_context *ce)
>>>>>>>> @@ -116,7 +116,7 @@ static inline int 
>>>>>>>> intel_context_lock_pinned(struct intel_context *ce)
>>>>>>>>   * Returns: true if the context is currently pinned for use by 
>>>>>>>> the GPU.
>>>>>>>>   */
>>>>>>>>  static inline bool
>>>>>>>> -intel_context_is_pinned(struct intel_context *ce)
>>>>>>>> +intel_context_is_pinned(const struct intel_context *ce)
>>>>>>>>  {
>>>>>>>>      return atomic_read(&ce->pin_count);
>>>>>>>>  }
>>>>>>>> @@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct 
>>>>>>>> intel_context *ce)
>>>>>>>>      clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>>>>>>>  }
>>>>>>>> -u64 intel_context_get_total_runtime_ns(const struct 
>>>>>>>> intel_context *ce);
>>>>>>>> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>>>>>>>  u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>>>>>>>  static inline u64 intel_context_clock(void)
>>>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
>>>>>>>> b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>>> index 09f82545789f..797bb4242c18 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>>> @@ -38,6 +38,9 @@ struct intel_context_ops {
>>>>>>>>  #define COPS_RUNTIME_CYCLES_BIT 1
>>>>>>>>  #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>>>>>>> +#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>>>>>>> +#define COPS_RUNTIME_ACTIVE_TOTAL 
>>>>>>>> BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>>>>>>> +
>>>>>>>>      int (*alloc)(struct intel_context *ce);
>>>>>>>>      void (*ban)(struct intel_context *ce, struct i915_request 
>>>>>>>> *rq);
>>>>>>>> @@ -55,6 +58,8 @@ struct intel_context_ops {
>>>>>>>>      void (*sched_disable)(struct intel_context *ce);
>>>>>>>> +    void (*update_stats)(struct intel_context *ce);
>>>>>>>> +
>>>>>>>>      void (*reset)(struct intel_context *ce);
>>>>>>>>      void (*destroy)(struct kref *kref);
>>>>>>>> @@ -146,6 +151,7 @@ struct intel_context {
>>>>>>>>              struct ewma_runtime avg;
>>>>>>>>              u64 total;
>>>>>>>>              u32 last;
>>>>>>>> +            u64 start_gt_clk;
>>>>>>>>              I915_SELFTEST_DECLARE(u32 num_underflow);
>>>>>>>>              I915_SELFTEST_DECLARE(u32 max_underflow);
>>>>>>>>          } runtime;
>>>>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
>>>>>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>>> index b3c9a9327f76..6231ad03e4eb 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>>> @@ -196,6 +196,11 @@ static inline u8 
>>>>>>>> guc_class_to_engine_class(u8 guc_class)
>>>>>>>>      return guc_class_engine_class_map[guc_class];
>>>>>>>>  }
>>>>>>>> +/* Per context engine usage stats: */
>>>>>>>> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO    (0x500 / sizeof(u32))
>>>>>>>> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI 
>>>>>>>> (PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>>>>>>> +#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID 
>>>>>>>> (PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>>>>>>> +
>>>>>>>>  /* Work item for submitting workloads into work queue of GuC. */
>>>>>>>>  struct guc_wq_item {
>>>>>>>>      u32 header;
>>>>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>>>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>>> index 5a1dfacf24ea..cbf3cbb983ce 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>>> @@ -378,7 +378,7 @@ static inline void 
>>>>>>>> set_context_guc_id_invalid(struct intel_context *ce)
>>>>>>>>      ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>>>>>>>  }
>>>>>>>> -static inline struct intel_guc *ce_to_guc(struct intel_context 
>>>>>>>> *ce)
>>>>>>>> +static inline struct intel_guc *ce_to_guc(const struct 
>>>>>>>> intel_context *ce)
>>>>>>>>  {
>>>>>>>>      return &ce->engine->gt->uc.guc;
>>>>>>>>  }
>>>>>>>> @@ -1323,13 +1323,16 @@ static void 
>>>>>>>> __update_guc_busyness_stats(struct intel_guc *guc)
>>>>>>>>      spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>>>>  }
>>>>>>>> +static void __guc_context_update_clks(struct intel_context *ce);
>>>>>>>>  static void guc_timestamp_ping(struct work_struct *wrk)
>>>>>>>>  {
>>>>>>>>      struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>>>>>>>                           timestamp.work.work);
>>>>>>>>      struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>>>>>>>      struct intel_gt *gt = guc_to_gt(guc);
>>>>>>>> +    struct intel_context *ce;
>>>>>>>>      intel_wakeref_t wakeref;
>>>>>>>> +    unsigned long index;
>>>>>>>>      int srcu, ret;
>>>>>>>>      /*
>>>>>>>> @@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct 
>>>>>>>> work_struct *wrk)
>>>>>>>>      with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>>>>>>>          __update_guc_busyness_stats(guc);
>>>>>>>> +    /* adjust context stats for overflow */
>>>>>>>> +    xa_for_each(&guc->context_lookup, index, ce)
>>>>>>>> +        __guc_context_update_clks(ce);
>>>>>>>> +
>>>>>>>>      intel_gt_reset_unlock(gt, srcu);
>>>>>>>>      mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>>>>>>> @@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct 
>>>>>>>> intel_gt *gt)
>>>>>>>>               guc->timestamp.ping_delay);
>>>>>>>>  }
>>>>>>>> +static void __guc_context_update_clks(struct intel_context *ce)
>>>>>>>> +{
>>>>>>>> +    struct intel_guc *guc = ce_to_guc(ce);
>>>>>>>> +    struct intel_gt *gt = ce->engine->gt;
>>>>>>>> +    u32 *pphwsp, last_switch, engine_id;
>>>>>>>> +    u64 start_gt_clk, active;
>>>>>>>> +    unsigned long flags;
>>>>>>>> +    ktime_t unused;
>>>>>>>> +
>>>>>>>> +    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>>>>>>> +
>>>>>>>> +    /*
>>>>>>>> +     * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when 
>>>>>>>> context is switched
>>>>>>>> +     * out, however GuC updates PPHWSP offsets below. Hence KMD 
>>>>>>>> (CPU)
>>>>>>>> +     * relies on GuC and GPU for busyness calculations. Due to 
>>>>>>>> this, A
>>>>>>>> +     * potential race was highlighted in an earlier review that 
>>>>>>>> can lead to
>>>>>>>> +     * double accounting of busyness. While the solution to 
>>>>>>>> this is a wip,
>>>>>>>> +     * busyness is still usable for platforms running GuC 
>>>>>>>> submission.
>>>>>>>> +     */
>>>>>>>> +    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>>>>>>> +    last_switch = 
>>>>>>>> READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>>>>>>> +    engine_id = 
>>>>>>>> READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>>>>>>> +
>>>>>>>> +    guc_update_pm_timestamp(guc, &unused);
>>>>>>>> +
>>>>>>>> +    if (engine_id != 0xffffffff && last_switch) {
>>>>>>>> +        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>>>>>>> +        __extend_last_switch(guc, &start_gt_clk, last_switch);
>>>>>>>> +        active = intel_gt_clock_interval_to_ns(gt, 
>>>>>>>> guc->timestamp.gt_stamp - start_gt_clk);
>>>>>>>> +        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>>>>>>> +        WRITE_ONCE(ce->stats.active, active);
>>>>>>>> +    } else {
>>>>>>>> +        lrc_update_runtime(ce);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void guc_context_update_stats(struct intel_context *ce)
>>>>>>>> +{
>>>>>>>> +    if (!intel_context_pin_if_active(ce)) {
>>>>>>>> +        WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>>>>>>> +        WRITE_ONCE(ce->stats.active, 0);
>>>>>>>> +        return;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    __guc_context_update_clks(ce);
>>>>>>>> +    intel_context_unpin(ce);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>  static inline bool
>>>>>>>>  submission_disabled(struct intel_guc *guc)
>>>>>>>>  {
>>>>>>>> @@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct 
>>>>>>>> intel_context *ce)
>>>>>>>>  {
>>>>>>>>      struct intel_guc *guc = ce_to_guc(ce);
>>>>>>>> +    lrc_update_runtime(ce);
>>>>>>>>      unpin_guc_id(guc, ce);
>>>>>>>>      lrc_unpin(ce);
>>>>>>>> @@ -3183,6 +3241,7 @@ static void remove_from_context(struct 
>>>>>>>> i915_request *rq)
>>>>>>>>  }
>>>>>>>>  static const struct intel_context_ops guc_context_ops = {
>>>>>>>> +    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>>>>      .alloc = guc_context_alloc,
>>>>>>>>      .pre_pin = guc_context_pre_pin,
>>>>>>>> @@ -3199,6 +3258,8 @@ static const struct intel_context_ops 
>>>>>>>> guc_context_ops = {
>>>>>>>>      .sched_disable = guc_context_sched_disable,
>>>>>>>> +    .update_stats = guc_context_update_stats,
>>>>>>>> +
>>>>>>>>      .reset = lrc_reset,
>>>>>>>>      .destroy = guc_context_destroy,
>>>>>>>> @@ -3432,6 +3493,7 @@ static int 
>>>>>>>> guc_virtual_context_alloc(struct intel_context *ce)
>>>>>>>>  }
>>>>>>>>  static const struct intel_context_ops virtual_guc_context_ops = {
>>>>>>>> +    .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>>>>      .alloc = guc_virtual_context_alloc,
>>>>>>>>      .pre_pin = guc_virtual_context_pre_pin,
>>>>>>>> @@ -3447,6 +3509,7 @@ static const struct intel_context_ops 
>>>>>>>> virtual_guc_context_ops = {
>>>>>>>>      .exit = guc_virtual_context_exit,
>>>>>>>>      .sched_disable = guc_context_sched_disable,
>>>>>>>> +    .update_stats = guc_context_update_stats,
>>>>>>>
>>>>>>> There are also virtual_parent_context_ops and 
>>>>>>> virtual_child_context_ops - which means more test coverage is 
>>>>>>> needed..
>>>>>>
>>>>>> Trying to come back to this... The 
>>>>>> virtual_parent_context_ops/virtual_child_context_ops are used for 
>>>>>> parallel engines. GuC would only update the pphwsp of the parent 
>>>>>> context with the last_switched_in_time.
>>>>>>
>>>>>> In general, how should I report the busyness for a parallel engine?
>>>>>>
>>>>>> I would think it is busyness reported by parent context multiplied 
>>>>>> by width.
>>>>>
>>>>> That could a reasonable approximation but I can't say for certain. 
>>>>> Depends on the GuC scheduler implementation a bit. Like is anything 
>>>>> preventing child contexts from finishing their useful work ahead of 
>>>>> the parent context, or they are always strictly scheduled as one 
>>>>> entity and child engines are blocked from taking other workloads 
>>>>> until the parent is scheduled out?
>>>>
>>>> Correct, if a child finishes the work before parent/siblings for 
>>>> some reason, it cannot take up other work until all siblings are done.
>>>
>>> The only problem is that I guess one day that assumption might break 
>>> and we will "never" now. If you have some spare time it would be best 
>>> to add an IGT to verify this assumption, or at least put that work as 
>>> TODO in the backlog?
>>
>> I added some tests to IGT for parallel engine, but something is 
>> missing in the way I am submitting the batches to the parallel engine. 
>> I see some hangs, haven't had a chance to debug that. Will try to get 
>> to it and then post the updated i915 patches.
> 
> I think I may have to do the parallel engine testing later. Do you think 
> this patch alone is good enough for now? It does not enable context 
> busyness for parallel execution (which is just adding this 
> COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL in the parent/child 
> context ops)
> 
> If so, okay to post a rebased version?

I think so. Just please file a jira for the outstanding work.

Thanks,

Tvrtko

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-31 20:25     ` Dixit, Ashutosh
@ 2022-08-31 22:57       ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-08-31 22:57 UTC (permalink / raw)
  To: Dixit, Ashutosh; +Cc: intel-gfx

On Wed, Aug 31, 2022 at 01:25:11PM -0700, Dixit, Ashutosh wrote:
>On Fri, 26 Aug 2022 09:33:08 -0700, Umesh Nerlige Ramappa wrote:
>>
>
>Hi Umesh,
>
>Just to communicate my thoughts I have posted this patch on top of your
>patch:
>
>[1] https://patchwork.freedesktop.org/series/107983/
>
>Could you please take a look at that and see if it makes sense.
>
>> On Thu, Aug 25, 2022 at 06:44:50PM -0700, Dixit, Ashutosh wrote:
>> > On Thu, 04 Aug 2022 16:21:25 -0700, Umesh Nerlige Ramappa wrote:
>> >
>> > Hi Umesh, I am fairly new to this code so some questions will be below will
>> > be newbie questions, thanks for bearing with me.
>> >
>> >> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
>> >> index 654a092ed3d6..e2d70a9fdac0 100644
>> >> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>> >> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>> >> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>> >>	child->parallel.parent = parent;
>> >>  }
>> >>
>> >> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>> >> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>> >>  {
>> >>	u64 total, active;
>> >>
>> >> +	if (ce->ops->update_stats)
>> >> +		ce->ops->update_stats(ce);
>> >> +
>> >>	total = ce->stats.runtime.total;
>> >>	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>> >>		total *= ce->engine->gt->clock_period_ns;
>> >>
>> >>	active = READ_ONCE(ce->stats.active);
>> >> -	if (active)
>> >> +	/*
>> >> +	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
>> >> +	 * already provides the total active time of the context, so skip this
>> >> +	 * calculation when this flag is set.
>> >> +	 */
>> >> +	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>> >>		active = intel_context_clock() - active;
>> >>
>> >>	return total + active;
>> >
>> > /snip/
>> >
>> >> @@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>> >>	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>> >>		__update_guc_busyness_stats(guc);
>> >>
>> >> +	/* adjust context stats for overflow */
>> >> +	xa_for_each(&guc->context_lookup, index, ce)
>> >> +		__guc_context_update_clks(ce);
>> >
>> > What is the reason for calling __guc_context_update_clks() periodically
>> > from guc_timestamp_ping() since it appears we should just be able to call
>> > __guc_context_update_clks() from intel_context_get_total_runtime_ns() to
>> > update 'active'? Is the reason for calling __guc_context_update_clks()
>> > periodically that the calculations in __guc_context_update_clks() become
>> > invalid if the counters overflow?
>>
>> Correct, these are 32-bit counters and the worker just tracks overflow.
>
>OK.
>
>>
>> >
>> >> +
>> >>	intel_gt_reset_unlock(gt, srcu);
>> >>
>> >>	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>> >> @@ -1469,6 +1476,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
>> >>			 guc->timestamp.ping_delay);
>> >>  }
>> >>
>> >> +static void __guc_context_update_clks(struct intel_context *ce)
>> >> +{
>> >> +	struct intel_guc *guc = ce_to_guc(ce);
>> >> +	struct intel_gt *gt = ce->engine->gt;
>> >> +	u32 *pphwsp, last_switch, engine_id;
>> >> +	u64 start_gt_clk, active;
>> >> +	unsigned long flags;
>> >> +	ktime_t unused;
>> >> +
>> >> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
>> >> +
>> >> +	/*
>> >> +	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
>> >> +	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>> >> +	 * relies on GuC and GPU for busyness calculations. Due to this, A
>> >> +	 * potential race was highlighted in an earlier review that can lead to
>> >> +	 * double accounting of busyness. While the solution to this is a wip,
>> >> +	 * busyness is still usable for platforms running GuC submission.
>> >> +	 */
>> >> +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>> >> +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>> >> +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>> >> +
>> >> +	guc_update_pm_timestamp(guc, &unused);
>> >> +
>> >> +	if (engine_id != 0xffffffff && last_switch) {
>> >> +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>> >> +		__extend_last_switch(guc, &start_gt_clk, last_switch);
>> >> +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
>> >> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>> >> +		WRITE_ONCE(ce->stats.active, active);
>> >
>> > Should not need WRITE_ONCE to update regular memory. Not even sure we need
>> > READ_ONCE above.
>>
>> Not sure I checked what they do. I was thinking these are needed for the
>> memory ordering (as in be sure that start_gt_clk is updated before
>> active).
>
>As long as our operations are done under correct locks we don't have to
>worry about memory ordering. That is one of the reasons I am doing
>everything under the spinlock in [1].
>
>>
>> >
>> >> +	} else {
>> >> +		lrc_update_runtime(ce);
>> >
>> > As was being discussed, should not need this here in this function. See
>> > below too.
>>
>> In short, I added this here so that a query for busyness following idle can
>> be obtained immediately. For GuC backend, the context is unpinned after
>> disabling scheduling on that context and that is asynchronous.  Also if
>> there are more requests on that context, the scheduling may not be disabled
>> and unpin may not happen, so updated runtime would only be seen much much
>> later.
>>
>> It is still safe to call from here because we know that the context is not
>> active and has switched out. If it did switch in while we were reading
>> this, that's still fine, we would only report the value stored in the
>> context image.
>
>Agreed, but in [1] I have made this unconditional, not sure if you will
>agree or see problems with that.

That would get called every second (default intel_gpu_top query 
internal) for a long running workload. multiply that with all active 
contexts.
>
>>
>> >
>> >> +	}
>> >> +
>> >> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>> >> +}
>> >> +
>> >> +static void guc_context_update_stats(struct intel_context *ce)
>> >> +{
>> >> +	if (!intel_context_pin_if_active(ce)) {
>> >> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>> >> +		WRITE_ONCE(ce->stats.active, 0);
>> >
>> > Why do these need to be initialized to 0? Looks like the calculations in
>> > __guc_context_update_clks() will work even if we don't do this? Also I
>> > didn't follow the 'if (!intel_context_pin_if_active(ce))' check.
>>
>> __guc_context_update_clks accesses the context image, so we need to make
>> sure it's pinned. pin if active will not sleep/wait, so we can use it in
>> this path.
>
>I have added pinning in [1].
>
>> if context is not active, then we update the active stats to 0.
>
>In [1] active is just a local variable and I don't touch ce->stats.active
>at all.
>
>> >> +		return;
>> >> +	}
>> >> +
>> >> +	__guc_context_update_clks(ce);
>> >> +	intel_context_unpin(ce);
>> >> +}
>> >> +
>> >>  static inline bool
>> >>  submission_disabled(struct intel_guc *guc)
>> >>  {
>> >> @@ -2723,6 +2780,7 @@ static void guc_context_unpin(struct intel_context *ce)
>> >>  {
>> >>	struct intel_guc *guc = cce_to_guc(ce);
>> >>
>> >> +	lrc_update_runtime(ce);
>> >
>> > How about moving this into lrc_unpin() since that gets called from all guc
>> > context types (parent/child/virtual).
>>
>> looks like lrc_unpin is called from context_unpin path.
>>
>> Same as above: for GuC, the context_unpin is an async operation and may not
>> happen if there are multiple requests in queue.
>
>In [1] I have left lrc_unpin in guc_context_unpin but changed to
>lrc_update_runtime_locked.

 From your rfc patch, I like
- the idea of not touching ce->stats.active
- having the update_stats return u64
- not doing a rmw for start_gt_clk

With those changes, we are only accessing total in ce->stats, so we 
don't really need a lrc_update_runtime_locked.

Thanks,
Umesh


>
>Thanks.
>--
>Ashutosh

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-26 16:33   ` Umesh Nerlige Ramappa
@ 2022-08-31 20:25     ` Dixit, Ashutosh
  2022-08-31 22:57       ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 29+ messages in thread
From: Dixit, Ashutosh @ 2022-08-31 20:25 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

On Fri, 26 Aug 2022 09:33:08 -0700, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

Just to communicate my thoughts I have posted this patch on top of your
patch:

[1] https://patchwork.freedesktop.org/series/107983/

Could you please take a look at that and see if it makes sense.

> On Thu, Aug 25, 2022 at 06:44:50PM -0700, Dixit, Ashutosh wrote:
> > On Thu, 04 Aug 2022 16:21:25 -0700, Umesh Nerlige Ramappa wrote:
> >
> > Hi Umesh, I am fairly new to this code so some questions will be below will
> > be newbie questions, thanks for bearing with me.
> >
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> >> index 654a092ed3d6..e2d70a9fdac0 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> >> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> >> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
> >>	child->parallel.parent = parent;
> >>  }
> >>
> >> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
> >> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
> >>  {
> >>	u64 total, active;
> >>
> >> +	if (ce->ops->update_stats)
> >> +		ce->ops->update_stats(ce);
> >> +
> >>	total = ce->stats.runtime.total;
> >>	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
> >>		total *= ce->engine->gt->clock_period_ns;
> >>
> >>	active = READ_ONCE(ce->stats.active);
> >> -	if (active)
> >> +	/*
> >> +	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
> >> +	 * already provides the total active time of the context, so skip this
> >> +	 * calculation when this flag is set.
> >> +	 */
> >> +	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
> >>		active = intel_context_clock() - active;
> >>
> >>	return total + active;
> >
> > /snip/
> >
> >> @@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
> >>	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
> >>		__update_guc_busyness_stats(guc);
> >>
> >> +	/* adjust context stats for overflow */
> >> +	xa_for_each(&guc->context_lookup, index, ce)
> >> +		__guc_context_update_clks(ce);
> >
> > What is the reason for calling __guc_context_update_clks() periodically
> > from guc_timestamp_ping() since it appears we should just be able to call
> > __guc_context_update_clks() from intel_context_get_total_runtime_ns() to
> > update 'active'? Is the reason for calling __guc_context_update_clks()
> > periodically that the calculations in __guc_context_update_clks() become
> > invalid if the counters overflow?
>
> Correct, these are 32-bit counters and the worker just tracks overflow.

OK.

>
> >
> >> +
> >>	intel_gt_reset_unlock(gt, srcu);
> >>
> >>	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> >> @@ -1469,6 +1476,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
> >>			 guc->timestamp.ping_delay);
> >>  }
> >>
> >> +static void __guc_context_update_clks(struct intel_context *ce)
> >> +{
> >> +	struct intel_guc *guc = ce_to_guc(ce);
> >> +	struct intel_gt *gt = ce->engine->gt;
> >> +	u32 *pphwsp, last_switch, engine_id;
> >> +	u64 start_gt_clk, active;
> >> +	unsigned long flags;
> >> +	ktime_t unused;
> >> +
> >> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> >> +
> >> +	/*
> >> +	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
> >> +	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
> >> +	 * relies on GuC and GPU for busyness calculations. Due to this, A
> >> +	 * potential race was highlighted in an earlier review that can lead to
> >> +	 * double accounting of busyness. While the solution to this is a wip,
> >> +	 * busyness is still usable for platforms running GuC submission.
> >> +	 */
> >> +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
> >> +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
> >> +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
> >> +
> >> +	guc_update_pm_timestamp(guc, &unused);
> >> +
> >> +	if (engine_id != 0xffffffff && last_switch) {
> >> +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
> >> +		__extend_last_switch(guc, &start_gt_clk, last_switch);
> >> +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
> >> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
> >> +		WRITE_ONCE(ce->stats.active, active);
> >
> > Should not need WRITE_ONCE to update regular memory. Not even sure we need
> > READ_ONCE above.
>
> Not sure I checked what they do. I was thinking these are needed for the
> memory ordering (as in be sure that start_gt_clk is updated before
> active).

As long as our operations are done under correct locks we don't have to
worry about memory ordering. That is one of the reasons I am doing
everything under the spinlock in [1].

>
> >
> >> +	} else {
> >> +		lrc_update_runtime(ce);
> >
> > As was being discussed, should not need this here in this function. See
> > below too.
>
> In short, I added this here so that a query for busyness following idle can
> be obtained immediately. For GuC backend, the context is unpinned after
> disabling scheduling on that context and that is asynchronous.  Also if
> there are more requests on that context, the scheduling may not be disabled
> and unpin may not happen, so updated runtime would only be seen much much
> later.
>
> It is still safe to call from here because we know that the context is not
> active and has switched out. If it did switch in while we were reading
> this, that's still fine, we would only report the value stored in the
> context image.

Agreed, but in [1] I have made this unconditional, not sure if you will
agree or see problems with that.

>
> >
> >> +	}
> >> +
> >> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> >> +}
> >> +
> >> +static void guc_context_update_stats(struct intel_context *ce)
> >> +{
> >> +	if (!intel_context_pin_if_active(ce)) {
> >> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
> >> +		WRITE_ONCE(ce->stats.active, 0);
> >
> > Why do these need to be initialized to 0? Looks like the calculations in
> > __guc_context_update_clks() will work even if we don't do this? Also I
> > didn't follow the 'if (!intel_context_pin_if_active(ce))' check.
>
> __guc_context_update_clks accesses the context image, so we need to make
> sure it's pinned. pin if active will not sleep/wait, so we can use it in
> this path.

I have added pinning in [1].

> if context is not active, then we update the active stats to 0.

In [1] active is just a local variable and I don't touch ce->stats.active
at all.

> >> +		return;
> >> +	}
> >> +
> >> +	__guc_context_update_clks(ce);
> >> +	intel_context_unpin(ce);
> >> +}
> >> +
> >>  static inline bool
> >>  submission_disabled(struct intel_guc *guc)
> >>  {
> >> @@ -2723,6 +2780,7 @@ static void guc_context_unpin(struct intel_context *ce)
> >>  {
> >>	struct intel_guc *guc = cce_to_guc(ce);
> >>
> >> +	lrc_update_runtime(ce);
> >
> > How about moving this into lrc_unpin() since that gets called from all guc
> > context types (parent/child/virtual).
>
> looks like lrc_unpin is called from context_unpin path.
>
> Same as above: for GuC, the context_unpin is an async operation and may not
> happen if there are multiple requests in queue.

In [1] I have left lrc_unpin in guc_context_unpin but changed to
lrc_update_runtime_locked.

Thanks.
--
Ashutosh

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-26  1:44 ` Dixit, Ashutosh
@ 2022-08-26 16:33   ` Umesh Nerlige Ramappa
  2022-08-31 20:25     ` Dixit, Ashutosh
  0 siblings, 1 reply; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-08-26 16:33 UTC (permalink / raw)
  To: Dixit, Ashutosh; +Cc: intel-gfx

On Thu, Aug 25, 2022 at 06:44:50PM -0700, Dixit, Ashutosh wrote:
>On Thu, 04 Aug 2022 16:21:25 -0700, Umesh Nerlige Ramappa wrote:
>
>Hi Umesh, I am fairly new to this code so some questions will be below will
>be newbie questions, thanks for bearing with me.
>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
>> index 654a092ed3d6..e2d70a9fdac0 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>>	child->parallel.parent = parent;
>>  }
>>
>> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>  {
>>	u64 total, active;
>>
>> +	if (ce->ops->update_stats)
>> +		ce->ops->update_stats(ce);
>> +
>>	total = ce->stats.runtime.total;
>>	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>		total *= ce->engine->gt->clock_period_ns;
>>
>>	active = READ_ONCE(ce->stats.active);
>> -	if (active)
>> +	/*
>> +	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
>> +	 * already provides the total active time of the context, so skip this
>> +	 * calculation when this flag is set.
>> +	 */
>> +	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>		active = intel_context_clock() - active;
>>
>>	return total + active;
>
>/snip/
>
>> @@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>>	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>		__update_guc_busyness_stats(guc);
>>
>> +	/* adjust context stats for overflow */
>> +	xa_for_each(&guc->context_lookup, index, ce)
>> +		__guc_context_update_clks(ce);
>
>What is the reason for calling __guc_context_update_clks() periodically
>from guc_timestamp_ping() since it appears we should just be able to call
>__guc_context_update_clks() from intel_context_get_total_runtime_ns() to
>update 'active'? Is the reason for calling __guc_context_update_clks()
>periodically that the calculations in __guc_context_update_clks() become
>invalid if the counters overflow?

Correct, these are 32-bit counters and the worker just tracks overflow.

>
>> +
>>	intel_gt_reset_unlock(gt, srcu);
>>
>>	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>> @@ -1469,6 +1476,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
>>			 guc->timestamp.ping_delay);
>>  }
>>
>> +static void __guc_context_update_clks(struct intel_context *ce)
>> +{
>> +	struct intel_guc *guc = ce_to_guc(ce);
>> +	struct intel_gt *gt = ce->engine->gt;
>> +	u32 *pphwsp, last_switch, engine_id;
>> +	u64 start_gt_clk, active;
>> +	unsigned long flags;
>> +	ktime_t unused;
>> +
>> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
>> +
>> +	/*
>> +	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
>> +	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>> +	 * relies on GuC and GPU for busyness calculations. Due to this, A
>> +	 * potential race was highlighted in an earlier review that can lead to
>> +	 * double accounting of busyness. While the solution to this is a wip,
>> +	 * busyness is still usable for platforms running GuC submission.
>> +	 */
>> +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>> +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>> +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>> +
>> +	guc_update_pm_timestamp(guc, &unused);
>> +
>> +	if (engine_id != 0xffffffff && last_switch) {
>> +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>> +		__extend_last_switch(guc, &start_gt_clk, last_switch);
>> +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
>> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>> +		WRITE_ONCE(ce->stats.active, active);
>
>Should not need WRITE_ONCE to update regular memory. Not even sure we need
>READ_ONCE above.

Not sure I checked what they do. I was thinking these are needed for the 
memory ordering (as in be sure that start_gt_clk is updated before 
active).

>
>> +	} else {
>> +		lrc_update_runtime(ce);
>
>As was being discussed, should not need this here in this function. See
>below too.

In short, I added this here so that a query for busyness following idle 
can be obtained immediately. For GuC backend, the context is unpinned 
after disabling scheduling on that context and that is asynchronous.  
Also if there are more requests on that context, the scheduling may not 
be disabled and unpin may not happen, so updated runtime would only be 
seen much much later.

It is still safe to call from here because we know that the context is 
not active and has switched out. If it did switch in while we were 
reading this, that's still fine, we would only report the value stored 
in the context image.

>
>> +	}
>> +
>> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>> +}
>> +
>> +static void guc_context_update_stats(struct intel_context *ce)
>> +{
>> +	if (!intel_context_pin_if_active(ce)) {
>> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>> +		WRITE_ONCE(ce->stats.active, 0);
>
>Why do these need to be initialized to 0? Looks like the calculations in
>__guc_context_update_clks() will work even if we don't do this? Also I
>didn't follow the 'if (!intel_context_pin_if_active(ce))' check.

__guc_context_update_clks accesses the context image, so we need to make 
sure it's pinned. pin if active will not sleep/wait, so we can use it in 
this path.

if context is not active, then we update the active stats to 0.

>
>> +		return;
>> +	}
>> +
>> +	__guc_context_update_clks(ce);
>> +	intel_context_unpin(ce);
>> +}
>> +
>>  static inline bool
>>  submission_disabled(struct intel_guc *guc)
>>  {
>> @@ -2723,6 +2780,7 @@ static void guc_context_unpin(struct intel_context *ce)
>>  {
>>	struct intel_guc *guc = cce_to_guc(ce);
>>
>> +	lrc_update_runtime(ce);
>
>How about moving this into lrc_unpin() since that gets called from all guc
>context types (parent/child/virtual).

looks like lrc_unpin is called from context_unpin path.

Same as above: for GuC, the context_unpin is an async operation and may 
not happen if there are multiple requests in queue. 

Thanks,
Umesh
>
>>	unpin_guc_id(guc, ce);
>>	lrc_unpin(ce);
>>
>> @@ -3344,6 +3402,7 @@ static void remove_from_context(struct i915_request *rq)
>>  }
>>
>>  static const struct intel_context_ops guc_context_ops = {
>> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>	.alloc = guc_context_alloc,
>>
>>	.pre_pin = guc_context_pre_pin,
>> @@ -3360,6 +3419,8 @@ static const struct intel_context_ops guc_context_ops = {
>>
>>	.sched_disable = guc_context_sched_disable,
>>
>> +	.update_stats = guc_context_update_stats,
>> +
>>	.reset = lrc_reset,
>>	.destroy = guc_context_destroy,
>>
>> @@ -3593,6 +3654,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
>>  }
>>
>>  static const struct intel_context_ops virtual_guc_context_ops = {
>> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>	.alloc = guc_virtual_context_alloc,
>>
>>	.pre_pin = guc_virtual_context_pre_pin,
>> @@ -3608,6 +3670,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
>>	.exit = guc_virtual_context_exit,
>>
>>	.sched_disable = guc_context_sched_disable,
>> +	.update_stats = guc_context_update_stats,
>>
>>	.destroy = guc_context_destroy,
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
>> index b09d1d386574..8d81119fff14 100644
>> --- a/drivers/gpu/drm/i915/i915_drm_client.c
>> +++ b/drivers/gpu/drm/i915/i915_drm_client.c
>> @@ -147,11 +147,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
>>		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>>	seq_printf(m, "drm-client-id:\t%u\n", client->id);
>>
>> -	/*
>> -	 * Temporarily skip showing client engine information with GuC submission till
>> -	 * fetching engine busyness is implemented in the GuC submission backend
>> -	 */
>> -	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
>> +	if (GRAPHICS_VER(i915) < 8)
>>		return;
>>
>>	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
>> --
>> 2.37.1
>>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
       [not found]     ` <87fshl3yw0.wl-ashutosh.dixit@intel.com>
@ 2022-08-26 15:44       ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-08-26 15:44 UTC (permalink / raw)
  To: Dixit, Ashutosh; +Cc: intel-gfx

On Wed, Aug 24, 2022 at 06:17:19PM -0700, Dixit, Ashutosh wrote:
>On Fri, 05 Aug 2022 08:18:48 -0700, Umesh Nerlige Ramappa wrote:
>>
>> On Fri, Aug 05, 2022 at 10:45:30AM +0100, Tvrtko Ursulin wrote:
>> >
>> > On 05/08/2022 00:21, Umesh Nerlige Ramappa wrote:
>> >> -static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>> >> +static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
>> >
>> > This is odd since the helper now takes away constness. I can't really
>> > figure out why the change is needed?
>
>Hi Umesh, I am also wondering about this, I think you missed answering this
>question from Tvrtko.

This helper 'adds' constness, so wasn't sure if the comment was intended 
for this helper.

Thanks,
Umesh

>
>Thanks.
>--
>Ashutosh

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-04 23:21 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Umesh Nerlige Ramappa
  2022-08-05  9:45 ` Tvrtko Ursulin
  2022-08-25  5:03 ` Dixit, Ashutosh
@ 2022-08-26  1:44 ` Dixit, Ashutosh
  2022-08-26 16:33   ` Umesh Nerlige Ramappa
  2 siblings, 1 reply; 29+ messages in thread
From: Dixit, Ashutosh @ 2022-08-26  1:44 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

On Thu, 04 Aug 2022 16:21:25 -0700, Umesh Nerlige Ramappa wrote:

Hi Umesh, I am fairly new to this code so some questions will be below will
be newbie questions, thanks for bearing with me.

> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index 654a092ed3d6..e2d70a9fdac0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>	child->parallel.parent = parent;
>  }
>
> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>  {
>	u64 total, active;
>
> +	if (ce->ops->update_stats)
> +		ce->ops->update_stats(ce);
> +
>	total = ce->stats.runtime.total;
>	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>		total *= ce->engine->gt->clock_period_ns;
>
>	active = READ_ONCE(ce->stats.active);
> -	if (active)
> +	/*
> +	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
> +	 * already provides the total active time of the context, so skip this
> +	 * calculation when this flag is set.
> +	 */
> +	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>		active = intel_context_clock() - active;
>
>	return total + active;

/snip/

> @@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>		__update_guc_busyness_stats(guc);
>
> +	/* adjust context stats for overflow */
> +	xa_for_each(&guc->context_lookup, index, ce)
> +		__guc_context_update_clks(ce);

What is the reason for calling __guc_context_update_clks() periodically
from guc_timestamp_ping() since it appears we should just be able to call
__guc_context_update_clks() from intel_context_get_total_runtime_ns() to
update 'active'? Is the reason for calling __guc_context_update_clks()
periodically that the calculations in __guc_context_update_clks() become
invalid if the counters overflow?

> +
>	intel_gt_reset_unlock(gt, srcu);
>
>	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> @@ -1469,6 +1476,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
>			 guc->timestamp.ping_delay);
>  }
>
> +static void __guc_context_update_clks(struct intel_context *ce)
> +{
> +	struct intel_guc *guc = ce_to_guc(ce);
> +	struct intel_gt *gt = ce->engine->gt;
> +	u32 *pphwsp, last_switch, engine_id;
> +	u64 start_gt_clk, active;
> +	unsigned long flags;
> +	ktime_t unused;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	/*
> +	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
> +	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
> +	 * relies on GuC and GPU for busyness calculations. Due to this, A
> +	 * potential race was highlighted in an earlier review that can lead to
> +	 * double accounting of busyness. While the solution to this is a wip,
> +	 * busyness is still usable for platforms running GuC submission.
> +	 */
> +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
> +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
> +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
> +
> +	guc_update_pm_timestamp(guc, &unused);
> +
> +	if (engine_id != 0xffffffff && last_switch) {
> +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
> +		__extend_last_switch(guc, &start_gt_clk, last_switch);
> +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
> +		WRITE_ONCE(ce->stats.active, active);

Should not need WRITE_ONCE to update regular memory. Not even sure we need
READ_ONCE above.

> +	} else {
> +		lrc_update_runtime(ce);

As was being discussed, should not need this here in this function. See
below too.

> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void guc_context_update_stats(struct intel_context *ce)
> +{
> +	if (!intel_context_pin_if_active(ce)) {
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
> +		WRITE_ONCE(ce->stats.active, 0);

Why do these need to be initialized to 0? Looks like the calculations in
__guc_context_update_clks() will work even if we don't do this? Also I
didn't follow the 'if (!intel_context_pin_if_active(ce))' check.

> +		return;
> +	}
> +
> +	__guc_context_update_clks(ce);
> +	intel_context_unpin(ce);
> +}
> +
>  static inline bool
>  submission_disabled(struct intel_guc *guc)
>  {
> @@ -2723,6 +2780,7 @@ static void guc_context_unpin(struct intel_context *ce)
>  {
>	struct intel_guc *guc = ce_to_guc(ce);
>
> +	lrc_update_runtime(ce);

How about moving this into lrc_unpin() since that gets called from all guc
context types (parent/child/virtual).

>	unpin_guc_id(guc, ce);
>	lrc_unpin(ce);
>
> @@ -3344,6 +3402,7 @@ static void remove_from_context(struct i915_request *rq)
>  }
>
>  static const struct intel_context_ops guc_context_ops = {
> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>	.alloc = guc_context_alloc,
>
>	.pre_pin = guc_context_pre_pin,
> @@ -3360,6 +3419,8 @@ static const struct intel_context_ops guc_context_ops = {
>
>	.sched_disable = guc_context_sched_disable,
>
> +	.update_stats = guc_context_update_stats,
> +
>	.reset = lrc_reset,
>	.destroy = guc_context_destroy,
>
> @@ -3593,6 +3654,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
>  }
>
>  static const struct intel_context_ops virtual_guc_context_ops = {
> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>	.alloc = guc_virtual_context_alloc,
>
>	.pre_pin = guc_virtual_context_pre_pin,
> @@ -3608,6 +3670,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
>	.exit = guc_virtual_context_exit,
>
>	.sched_disable = guc_context_sched_disable,
> +	.update_stats = guc_context_update_stats,
>
>	.destroy = guc_context_destroy,
>
> diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
> index b09d1d386574..8d81119fff14 100644
> --- a/drivers/gpu/drm/i915/i915_drm_client.c
> +++ b/drivers/gpu/drm/i915/i915_drm_client.c
> @@ -147,11 +147,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
>		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>	seq_printf(m, "drm-client-id:\t%u\n", client->id);
>
> -	/*
> -	 * Temporarily skip showing client engine information with GuC submission till
> -	 * fetching engine busyness is implemented in the GuC submission backend
> -	 */
> -	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
> +	if (GRAPHICS_VER(i915) < 8)
>		return;
>
>	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
> --
> 2.37.1
>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-25  5:03 ` Dixit, Ashutosh
@ 2022-08-25 21:12   ` Dixit, Ashutosh
  0 siblings, 0 replies; 29+ messages in thread
From: Dixit, Ashutosh @ 2022-08-25 21:12 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

On Wed, 24 Aug 2022 22:03:19 -0700, Dixit, Ashutosh wrote:
>
> On Thu, 04 Aug 2022 16:21:25 -0700, Umesh Nerlige Ramappa wrote:
> >
>
> Hi Umesh,
>
> Still reviewing but I have a question below.

Please ignore this mail for now, mostly a result of my misunderstanding the
code. I will ask again if I have any questions. Thanks.

>
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> > index 654a092ed3d6..e2d70a9fdac0 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> > @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
> >	child->parallel.parent = parent;
> >  }
> >
> > -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
> > +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
> >  {
> >	u64 total, active;
> >
> > +	if (ce->ops->update_stats)
> > +		ce->ops->update_stats(ce);
> > +
>
> /snip/
>
> > @@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
> >	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
> >		__update_guc_busyness_stats(guc);
> >
> > +	/* adjust context stats for overflow */
> > +	xa_for_each(&guc->context_lookup, index, ce)
> > +		__guc_context_update_clks(ce);
> > +
>
> The question is why do we have 2 functions: __guc_context_update_clks()
> (which we call periodically from guc_timestamp_ping()) and
> guc_context_update_stats() (which we call non-periodically from
> intel_context_get_total_runtime_ns()? Why don't we have just one function
> which is called from both places? Or rather why don't we call
> guc_context_update_stats() from both places?
>
> If we don't call guc_context_update_stats() periodically from
> guc_timestamp_ping() how e.g. does ce->stats.runtime.start_gt_clk get reset
> to 0? If it gets reset to 0 in __guc_context_update_clks() then why do we
> need to reset it in guc_context_update_stats()?
>
> Also IMO guc->timestamp.lock should be taken by this single function,
> (otherwise guc_context_update_stats() is modifying
> ce->stats.runtime.start_gt_clk without taking the lock).
>
> Thanks.
> --
> Ashutosh
>
> > +static void __guc_context_update_clks(struct intel_context *ce)
> > +{
> > +	struct intel_guc *guc = ce_to_guc(ce);
> > +	struct intel_gt *gt = ce->engine->gt;
> > +	u32 *pphwsp, last_switch, engine_id;
> > +	u64 start_gt_clk, active;
> > +	unsigned long flags;
> > +	ktime_t unused;
> > +
> > +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> > +
> > +	/*
> > +	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
> > +	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
> > +	 * relies on GuC and GPU for busyness calculations. Due to this, A
> > +	 * potential race was highlighted in an earlier review that can lead to
> > +	 * double accounting of busyness. While the solution to this is a wip,
> > +	 * busyness is still usable for platforms running GuC submission.
> > +	 */
> > +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
> > +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
> > +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
> > +
> > +	guc_update_pm_timestamp(guc, &unused);
> > +
> > +	if (engine_id != 0xffffffff && last_switch) {
> > +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
> > +		__extend_last_switch(guc, &start_gt_clk, last_switch);
> > +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
> > +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
> > +		WRITE_ONCE(ce->stats.active, active);
> > +	} else {
> > +		lrc_update_runtime(ce);
> > +	}
> > +
> > +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> > +}
> > +
> > +static void guc_context_update_stats(struct intel_context *ce)
> > +{
> > +	if (!intel_context_pin_if_active(ce)) {
> > +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
> > +		WRITE_ONCE(ce->stats.active, 0);
> > +		return;
> > +	}
> > +
> > +	__guc_context_update_clks(ce);
> > +	intel_context_unpin(ce);
> > +}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-06-15 17:42       ` Umesh Nerlige Ramappa
@ 2022-08-25  6:18         ` Dixit, Ashutosh
  0 siblings, 0 replies; 29+ messages in thread
From: Dixit, Ashutosh @ 2022-08-25  6:18 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

On Wed, 15 Jun 2022 10:42:08 -0700, Umesh Nerlige Ramappa wrote:
>
> >>>> +static void __guc_context_update_clks(struct intel_context *ce)
> >>>> +{
> >>>> +    struct intel_guc *guc = ce_to_guc(ce);
> >>>> +    struct intel_gt *gt = ce->engine->gt;
> >>>> +    u32 *pphwsp, last_switch, engine_id;
> >>>> +    u64 start_gt_clk = 0, active = 0;
> >>>
> >>> No need to init these two.
> >>>
> >>>> +    unsigned long flags;
> >>>> +    ktime_t unused;
> >>>> +
> >>>> +    spin_lock_irqsave(&guc->timestamp.lock, flags);
> >>>> +
> >>>> +    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
> >>>> +    last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
> >>>> +    engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
> >>>> +
> >>>> +    guc_update_pm_timestamp(guc, &unused);
> >>>> +
> >>>> +    if (engine_id != 0xffffffff && last_switch) {
> >>>> +        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
> >>>> +        __extend_last_switch(guc, &start_gt_clk, last_switch);
> >>>> +        active = intel_gt_clock_interval_to_ns(gt,
> >>>> guc->timestamp.gt_stamp - start_gt_clk);
> >>>> +        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
> >>>> +        WRITE_ONCE(ce->stats.active, active);
> >>>> +    } else {
> >>>> +        lrc_update_runtime(ce);
> >>>
> >>> Why is this called from here? Presumably it was called already from
> >>> guc_context_unpin if here code things context is not active. Or will be
> >>> called shortly, once context save is done.
> >>
> >> guc_context_unpin is only called in the path of ce->sched_disable. The
> >> sched_disable is implemented in GuC (H2G message). Once the
> >> corresponding G2H response is received, the context is actually
> >> unpinned, eventually calling guc_context_unpin. Also the context may not
> >> necessarily be disabled after each context exit.
> >
> > So if I understand correctly, lrc runtime is only updated if someone is
> > reading the busyness and not as part of normal context state transitions?
>
> If you mean context_in/out events (like csb interrupts), only GuC can see
> those events. KMD has no visibility into that. These 3 paths call
> lrc_update_runtime.
>
> user query: (engine_id != 0xffffffff && last_switch) translates to GuC
> being within context_in and context_out events, so updating it outside of
> this window is one way to report the correct busyness.
>
> worker: guc_timestamp_ping()  also updates context stats (infrequently) for
> all contexts primarily to take care of overflows.
>
> context unpin: Existing code calls lrc_update_runtime only when unpinning
> the context which takes care of accumulating busyness when requests are
> retired.

Will adding lrc_update_runtime() to lrc_unpin() work?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-04 23:21 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Umesh Nerlige Ramappa
  2022-08-05  9:45 ` Tvrtko Ursulin
@ 2022-08-25  5:03 ` Dixit, Ashutosh
  2022-08-25 21:12   ` Dixit, Ashutosh
  2022-08-26  1:44 ` Dixit, Ashutosh
  2 siblings, 1 reply; 29+ messages in thread
From: Dixit, Ashutosh @ 2022-08-25  5:03 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

On Thu, 04 Aug 2022 16:21:25 -0700, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

Still reviewing but I have a question below.

> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index 654a092ed3d6..e2d70a9fdac0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>	child->parallel.parent = parent;
>  }
>
> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>  {
>	u64 total, active;
>
> +	if (ce->ops->update_stats)
> +		ce->ops->update_stats(ce);
> +

/snip/

> @@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>		__update_guc_busyness_stats(guc);
>
> +	/* adjust context stats for overflow */
> +	xa_for_each(&guc->context_lookup, index, ce)
> +		__guc_context_update_clks(ce);
> +

The question is why do we have 2 functions: __guc_context_update_clks()
(which we call periodically from guc_timestamp_ping()) and
guc_context_update_stats() (which we call non-periodically from
intel_context_get_total_runtime_ns()? Why don't we have just one function
which is called from both places? Or rather why don't we call
guc_context_update_stats() from both places?

If we don't call guc_context_update_stats() periodically from
guc_timestamp_ping() how e.g. does ce->stats.runtime.start_gt_clk get reset
to 0? If it gets reset to 0 in __guc_context_update_clks() then why do we
need to reset it in guc_context_update_stats()?

Also IMO guc->timestamp.lock should be taken by this single function,
(otherwise guc_context_update_stats() is modifying
ce->stats.runtime.start_gt_clk without taking the lock).

Thanks.
--
Ashutosh

> +static void __guc_context_update_clks(struct intel_context *ce)
> +{
> +	struct intel_guc *guc = ce_to_guc(ce);
> +	struct intel_gt *gt = ce->engine->gt;
> +	u32 *pphwsp, last_switch, engine_id;
> +	u64 start_gt_clk, active;
> +	unsigned long flags;
> +	ktime_t unused;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	/*
> +	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
> +	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
> +	 * relies on GuC and GPU for busyness calculations. Due to this, A
> +	 * potential race was highlighted in an earlier review that can lead to
> +	 * double accounting of busyness. While the solution to this is a wip,
> +	 * busyness is still usable for platforms running GuC submission.
> +	 */
> +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
> +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
> +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
> +
> +	guc_update_pm_timestamp(guc, &unused);
> +
> +	if (engine_id != 0xffffffff && last_switch) {
> +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
> +		__extend_last_switch(guc, &start_gt_clk, last_switch);
> +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
> +		WRITE_ONCE(ce->stats.active, active);
> +	} else {
> +		lrc_update_runtime(ce);
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void guc_context_update_stats(struct intel_context *ce)
> +{
> +	if (!intel_context_pin_if_active(ce)) {
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
> +		WRITE_ONCE(ce->stats.active, 0);
> +		return;
> +	}
> +
> +	__guc_context_update_clks(ce);
> +	intel_context_unpin(ce);
> +}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-05  9:45 ` Tvrtko Ursulin
@ 2022-08-05 15:18   ` Umesh Nerlige Ramappa
       [not found]     ` <87fshl3yw0.wl-ashutosh.dixit@intel.com>
  0 siblings, 1 reply; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-08-05 15:18 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Fri, Aug 05, 2022 at 10:45:30AM +0100, Tvrtko Ursulin wrote:
>
>On 05/08/2022 00:21, Umesh Nerlige Ramappa wrote:
>>From: John Harrison <John.C.Harrison@Intel.com>
>>
>>GuC provides engine_id and last_switch_in ticks for an active context in
>>the pphwsp. The context image provides a 32 bit total ticks which is the
>>accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>information is used to calculate the context busyness as follows:
>>
>>If the engine_id is valid, then busyness is the sum of accumulated total
>>ticks and active ticks. Active ticks is calculated with current gt time
>>as reference.
>>
>>If engine_id is invalid, busyness is equal to accumulated total ticks.
>>
>>Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
>>potential race was highlighted in an earlier review that can lead to
>>double accounting of busyness. While the solution to this is a wip,
>>busyness is still usable for platforms running GuC submission.
>>
>>Remaining work: Enable and test context busyness for
>>virtual_parent_context_ops and virtual_child_context_ops.
>
>I meant track the IGT work in the jira internally. :)

Oh, I did do that and added this here as well. Note that I have not 
enabled the busyness in i915 for the parent/child context ops since I 
was not able to verify it yet.

>
>Otherwise:
>
>Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
>Also, can someone else please do the full review? I'm afraid with the 
>passage of time I forgot what little I knew about how GuC tracks this 
>data. :(

I will ask around

>
>Some nits and questions below.
>
>>v2: (Tvrtko)
>>- Use COPS_RUNTIME_ACTIVE_TOTAL
>>- Add code comment for the race
>>- Undo local variables initializations
>>
>>v3:
>>- Add support for virtual engines based on
>>   https://patchwork.freedesktop.org/series/105227/
>>
>>v4:
>>- Update commit message with remaining work.
>>- Rebase
>>
>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>---
>>  drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>>  drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>  drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 ++++++++++++++++++-
>>  drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>  6 files changed, 89 insertions(+), 11 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
>>index 654a092ed3d6..e2d70a9fdac0 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_context.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>@@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>>  	child->parallel.parent = parent;
>>  }
>>-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>  {
>>  	u64 total, active;
>>+	if (ce->ops->update_stats)
>>+		ce->ops->update_stats(ce);
>>+
>>  	total = ce->stats.runtime.total;
>>  	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>  		total *= ce->engine->gt->clock_period_ns;
>>  	active = READ_ONCE(ce->stats.active);
>>-	if (active)
>>+	/*
>>+	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
>>+	 * already provides the total active time of the context, so skip this
>>+	 * calculation when this flag is set.
>>+	 */
>>+	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>  		active = intel_context_clock() - active;
>>  	return total + active;
>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
>>index 8e2d70630c49..3d1d7436c1a4 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_context.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>@@ -58,7 +58,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
>>  	return !!ce->parallel.number_children;
>>  }
>>-static inline bool intel_context_is_pinned(struct intel_context *ce);
>>+static inline bool intel_context_is_pinned(const struct intel_context *ce);
>>  static inline struct intel_context *
>>  intel_context_to_parent(struct intel_context *ce)
>>@@ -118,7 +118,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
>>   * Returns: true if the context is currently pinned for use by the GPU.
>>   */
>>  static inline bool
>>-intel_context_is_pinned(struct intel_context *ce)
>>+intel_context_is_pinned(const struct intel_context *ce)
>>  {
>>  	return atomic_read(&ce->pin_count);
>>  }
>>@@ -362,7 +362,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
>>  	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>  }
>>-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>  u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>  static inline u64 intel_context_clock(void)
>>diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>index 04eacae1aca5..f7ff4c7d81c7 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>@@ -38,6 +38,9 @@ struct intel_context_ops {
>>  #define COPS_RUNTIME_CYCLES_BIT 1
>>  #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>+
>>  	int (*alloc)(struct intel_context *ce);
>>  	void (*revoke)(struct intel_context *ce, struct i915_request *rq,
>>@@ -56,6 +59,8 @@ struct intel_context_ops {
>>  	void (*sched_disable)(struct intel_context *ce);
>>+	void (*update_stats)(struct intel_context *ce);
>>+
>>  	void (*reset)(struct intel_context *ce);
>>  	void (*destroy)(struct kref *kref);
>>@@ -148,6 +153,7 @@ struct intel_context {
>>  			struct ewma_runtime avg;
>>  			u64 total;
>>  			u32 last;
>>+			u64 start_gt_clk;
>
>Nit - put u64 next to u64 and u32 next to u32 to avoid holes.
>
>>  			I915_SELFTEST_DECLARE(u32 num_underflow);
>>  			I915_SELFTEST_DECLARE(u32 max_underflow);
>>  		} runtime;
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>index 323b055e5db9..c7b54f1631b9 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>@@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
>>  	return guc_class_engine_class_map[guc_class];
>>  }
>>+/* Per context engine usage stats: */
>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>+
>>  /* Work item for submitting workloads into work queue of GuC. */
>>  struct guc_wq_item {
>>  	u32 header;
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>index 0d17da77e787..c9fefa254a7e 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>@@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
>>  	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>  }
>>-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>>+static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
>
>This is odd since the helper now takes away constness. I can't really 
>figure out why the change is needed?
>
>>  {
>>  	return &ce->engine->gt->uc.guc;
>>  }
>>@@ -1376,13 +1376,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
>>  	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>  }
>>+static void __guc_context_update_clks(struct intel_context *ce);
>>  static void guc_timestamp_ping(struct work_struct *wrk)
>>  {
>>  	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>  					     timestamp.work.work);
>>  	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>  	struct intel_gt *gt = guc_to_gt(guc);
>>+	struct intel_context *ce;
>>  	intel_wakeref_t wakeref;
>>+	unsigned long index;
>>  	int srcu, ret;
>>  	/*
>>@@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>>  	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>  		__update_guc_busyness_stats(guc);
>>+	/* adjust context stats for overflow */
>>+	xa_for_each(&guc->context_lookup, index, ce)
>>+		__guc_context_update_clks(ce);
>>+
>>  	intel_gt_reset_unlock(gt, srcu);
>>  	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>@@ -1469,6 +1476,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
>>  			 guc->timestamp.ping_delay);
>>  }
>>+static void __guc_context_update_clks(struct intel_context *ce)
>>+{
>>+	struct intel_guc *guc = ce_to_guc(ce);
>>+	struct intel_gt *gt = ce->engine->gt;
>>+	u32 *pphwsp, last_switch, engine_id;
>>+	u64 start_gt_clk, active;
>>+	unsigned long flags;
>>+	ktime_t unused;
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	/*
>>+	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
>>+	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>>+	 * relies on GuC and GPU for busyness calculations. Due to this, A
>>+	 * potential race was highlighted in an earlier review that can lead to
>>+	 * double accounting of busyness. While the solution to this is a wip,
>>+	 * busyness is still usable for platforms running GuC submission.
>>+	 */
>>+	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>+	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>
>What about PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI? I see it defined but 
>isn't used so is the timestmap 32 bit just ABI reserved 64 bits for 
>future proofing or something?

Yes, the _HI is not populated by GuC yet, but reserved for future 
extension to 64 bits.

Thanks,
Umesh

>
>Regards,
>
>Tvrtko
>
>>+	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>+
>>+	guc_update_pm_timestamp(guc, &unused);
>>+
>>+	if (engine_id != 0xffffffff && last_switch) {
>>+		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>+		__extend_last_switch(guc, &start_gt_clk, last_switch);
>>+		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
>>+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>+		WRITE_ONCE(ce->stats.active, active);
>>+	} else {
>>+		lrc_update_runtime(ce);
>>+	}
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+}
>>+
>>+static void guc_context_update_stats(struct intel_context *ce)
>>+{
>>+	if (!intel_context_pin_if_active(ce)) {
>>+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>+		WRITE_ONCE(ce->stats.active, 0);
>>+		return;
>>+	}
>>+
>>+	__guc_context_update_clks(ce);
>>+	intel_context_unpin(ce);
>>+}
>>+
>>  static inline bool
>>  submission_disabled(struct intel_guc *guc)
>>  {
>>@@ -2723,6 +2780,7 @@ static void guc_context_unpin(struct intel_context *ce)
>>  {
>>  	struct intel_guc *guc = ce_to_guc(ce);
>>+	lrc_update_runtime(ce);
>>  	unpin_guc_id(guc, ce);
>>  	lrc_unpin(ce);
>>@@ -3344,6 +3402,7 @@ static void remove_from_context(struct i915_request *rq)
>>  }
>>  static const struct intel_context_ops guc_context_ops = {
>>+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>  	.alloc = guc_context_alloc,
>>  	.pre_pin = guc_context_pre_pin,
>>@@ -3360,6 +3419,8 @@ static const struct intel_context_ops guc_context_ops = {
>>  	.sched_disable = guc_context_sched_disable,
>>+	.update_stats = guc_context_update_stats,
>>+
>>  	.reset = lrc_reset,
>>  	.destroy = guc_context_destroy,
>>@@ -3593,6 +3654,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
>>  }
>>  static const struct intel_context_ops virtual_guc_context_ops = {
>>+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>  	.alloc = guc_virtual_context_alloc,
>>  	.pre_pin = guc_virtual_context_pre_pin,
>>@@ -3608,6 +3670,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
>>  	.exit = guc_virtual_context_exit,
>>  	.sched_disable = guc_context_sched_disable,
>>+	.update_stats = guc_context_update_stats,
>>  	.destroy = guc_context_destroy,
>>diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
>>index b09d1d386574..8d81119fff14 100644
>>--- a/drivers/gpu/drm/i915/i915_drm_client.c
>>+++ b/drivers/gpu/drm/i915/i915_drm_client.c
>>@@ -147,11 +147,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
>>  		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>>  	seq_printf(m, "drm-client-id:\t%u\n", client->id);
>>-	/*
>>-	 * Temporarily skip showing client engine information with GuC submission till
>>-	 * fetching engine busyness is implemented in the GuC submission backend
>>-	 */
>>-	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
>>+	if (GRAPHICS_VER(i915) < 8)
>>  		return;
>>  	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-08-04 23:21 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Umesh Nerlige Ramappa
@ 2022-08-05  9:45 ` Tvrtko Ursulin
  2022-08-05 15:18   ` Umesh Nerlige Ramappa
  2022-08-25  5:03 ` Dixit, Ashutosh
  2022-08-26  1:44 ` Dixit, Ashutosh
  2 siblings, 1 reply; 29+ messages in thread
From: Tvrtko Ursulin @ 2022-08-05  9:45 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx


On 05/08/2022 00:21, Umesh Nerlige Ramappa wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> GuC provides engine_id and last_switch_in ticks for an active context in
> the pphwsp. The context image provides a 32 bit total ticks which is the
> accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
> information is used to calculate the context busyness as follows:
> 
> If the engine_id is valid, then busyness is the sum of accumulated total
> ticks and active ticks. Active ticks is calculated with current gt time
> as reference.
> 
> If engine_id is invalid, busyness is equal to accumulated total ticks.
> 
> Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
> potential race was highlighted in an earlier review that can lead to
> double accounting of busyness. While the solution to this is a wip,
> busyness is still usable for platforms running GuC submission.
> 
> Remaining work: Enable and test context busyness for
> virtual_parent_context_ops and virtual_child_context_ops.

I meant track the IGT work in the jira internally. :)

Otherwise:

Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Also, can someone else please do the full review? I'm afraid with the 
passage of time I forgot what little I knew about how GuC tracks this 
data. :(

Some nits and questions below.

> v2: (Tvrtko)
> - Use COPS_RUNTIME_ACTIVE_TOTAL
> - Add code comment for the race
> - Undo local variables initializations
> 
> v3:
> - Add support for virtual engines based on
>    https://patchwork.freedesktop.org/series/105227/
> 
> v4:
> - Update commit message with remaining work.
> - Rebase
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
>   drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>   drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 ++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>   6 files changed, 89 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index 654a092ed3d6..e2d70a9fdac0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>   	child->parallel.parent = parent;
>   }
>   
> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>   {
>   	u64 total, active;
>   
> +	if (ce->ops->update_stats)
> +		ce->ops->update_stats(ce);
> +
>   	total = ce->stats.runtime.total;
>   	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>   		total *= ce->engine->gt->clock_period_ns;
>   
>   	active = READ_ONCE(ce->stats.active);
> -	if (active)
> +	/*
> +	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
> +	 * already provides the total active time of the context, so skip this
> +	 * calculation when this flag is set.
> +	 */
> +	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>   		active = intel_context_clock() - active;
>   
>   	return total + active;
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index 8e2d70630c49..3d1d7436c1a4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -58,7 +58,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
>   	return !!ce->parallel.number_children;
>   }
>   
> -static inline bool intel_context_is_pinned(struct intel_context *ce);
> +static inline bool intel_context_is_pinned(const struct intel_context *ce);
>   
>   static inline struct intel_context *
>   intel_context_to_parent(struct intel_context *ce)
> @@ -118,7 +118,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
>    * Returns: true if the context is currently pinned for use by the GPU.
>    */
>   static inline bool
> -intel_context_is_pinned(struct intel_context *ce)
> +intel_context_is_pinned(const struct intel_context *ce)
>   {
>   	return atomic_read(&ce->pin_count);
>   }
> @@ -362,7 +362,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
>   	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>   }
>   
> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>   u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>   
>   static inline u64 intel_context_clock(void)
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 04eacae1aca5..f7ff4c7d81c7 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -38,6 +38,9 @@ struct intel_context_ops {
>   #define COPS_RUNTIME_CYCLES_BIT 1
>   #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>   
> +#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
> +#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
> +
>   	int (*alloc)(struct intel_context *ce);
>   
>   	void (*revoke)(struct intel_context *ce, struct i915_request *rq,
> @@ -56,6 +59,8 @@ struct intel_context_ops {
>   
>   	void (*sched_disable)(struct intel_context *ce);
>   
> +	void (*update_stats)(struct intel_context *ce);
> +
>   	void (*reset)(struct intel_context *ce);
>   	void (*destroy)(struct kref *kref);
>   
> @@ -148,6 +153,7 @@ struct intel_context {
>   			struct ewma_runtime avg;
>   			u64 total;
>   			u32 last;
> +			u64 start_gt_clk;

Nit - put u64 next to u64 and u32 next to u32 to avoid holes.

>   			I915_SELFTEST_DECLARE(u32 num_underflow);
>   			I915_SELFTEST_DECLARE(u32 max_underflow);
>   		} runtime;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index 323b055e5db9..c7b54f1631b9 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
>   	return guc_class_engine_class_map[guc_class];
>   }
>   
> +/* Per context engine usage stats: */
> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
> +#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
> +
>   /* Work item for submitting workloads into work queue of GuC. */
>   struct guc_wq_item {
>   	u32 header;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 0d17da77e787..c9fefa254a7e 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
>   	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>   }
>   
> -static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
> +static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)

This is odd since the helper now takes away constness. I can't really 
figure out why the change is needed?

>   {
>   	return &ce->engine->gt->uc.guc;
>   }
> @@ -1376,13 +1376,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
>   	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>   }
>   
> +static void __guc_context_update_clks(struct intel_context *ce);
>   static void guc_timestamp_ping(struct work_struct *wrk)
>   {
>   	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>   					     timestamp.work.work);
>   	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>   	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_context *ce;
>   	intel_wakeref_t wakeref;
> +	unsigned long index;
>   	int srcu, ret;
>   
>   	/*
> @@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>   	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>   		__update_guc_busyness_stats(guc);
>   
> +	/* adjust context stats for overflow */
> +	xa_for_each(&guc->context_lookup, index, ce)
> +		__guc_context_update_clks(ce);
> +
>   	intel_gt_reset_unlock(gt, srcu);
>   
>   	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> @@ -1469,6 +1476,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
>   			 guc->timestamp.ping_delay);
>   }
>   
> +static void __guc_context_update_clks(struct intel_context *ce)
> +{
> +	struct intel_guc *guc = ce_to_guc(ce);
> +	struct intel_gt *gt = ce->engine->gt;
> +	u32 *pphwsp, last_switch, engine_id;
> +	u64 start_gt_clk, active;
> +	unsigned long flags;
> +	ktime_t unused;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	/*
> +	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
> +	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
> +	 * relies on GuC and GPU for busyness calculations. Due to this, A
> +	 * potential race was highlighted in an earlier review that can lead to
> +	 * double accounting of busyness. While the solution to this is a wip,
> +	 * busyness is still usable for platforms running GuC submission.
> +	 */
> +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
> +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);

What about PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI? I see it defined but isn't 
used so is the timestmap 32 bit just ABI reserved 64 bits for future 
proofing or something?

Regards,

Tvrtko

> +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
> +
> +	guc_update_pm_timestamp(guc, &unused);
> +
> +	if (engine_id != 0xffffffff && last_switch) {
> +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
> +		__extend_last_switch(guc, &start_gt_clk, last_switch);
> +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
> +		WRITE_ONCE(ce->stats.active, active);
> +	} else {
> +		lrc_update_runtime(ce);
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void guc_context_update_stats(struct intel_context *ce)
> +{
> +	if (!intel_context_pin_if_active(ce)) {
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
> +		WRITE_ONCE(ce->stats.active, 0);
> +		return;
> +	}
> +
> +	__guc_context_update_clks(ce);
> +	intel_context_unpin(ce);
> +}
> +
>   static inline bool
>   submission_disabled(struct intel_guc *guc)
>   {
> @@ -2723,6 +2780,7 @@ static void guc_context_unpin(struct intel_context *ce)
>   {
>   	struct intel_guc *guc = ce_to_guc(ce);
>   
> +	lrc_update_runtime(ce);
>   	unpin_guc_id(guc, ce);
>   	lrc_unpin(ce);
>   
> @@ -3344,6 +3402,7 @@ static void remove_from_context(struct i915_request *rq)
>   }
>   
>   static const struct intel_context_ops guc_context_ops = {
> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>   	.alloc = guc_context_alloc,
>   
>   	.pre_pin = guc_context_pre_pin,
> @@ -3360,6 +3419,8 @@ static const struct intel_context_ops guc_context_ops = {
>   
>   	.sched_disable = guc_context_sched_disable,
>   
> +	.update_stats = guc_context_update_stats,
> +
>   	.reset = lrc_reset,
>   	.destroy = guc_context_destroy,
>   
> @@ -3593,6 +3654,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
>   }
>   
>   static const struct intel_context_ops virtual_guc_context_ops = {
> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>   	.alloc = guc_virtual_context_alloc,
>   
>   	.pre_pin = guc_virtual_context_pre_pin,
> @@ -3608,6 +3670,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
>   	.exit = guc_virtual_context_exit,
>   
>   	.sched_disable = guc_context_sched_disable,
> +	.update_stats = guc_context_update_stats,
>   
>   	.destroy = guc_context_destroy,
>   
> diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
> index b09d1d386574..8d81119fff14 100644
> --- a/drivers/gpu/drm/i915/i915_drm_client.c
> +++ b/drivers/gpu/drm/i915/i915_drm_client.c
> @@ -147,11 +147,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
>   		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>   	seq_printf(m, "drm-client-id:\t%u\n", client->id);
>   
> -	/*
> -	 * Temporarily skip showing client engine information with GuC submission till
> -	 * fetching engine busyness is implemented in the GuC submission backend
> -	 */
> -	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
> +	if (GRAPHICS_VER(i915) < 8)
>   		return;
>   
>   	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
@ 2022-08-04 23:21 Umesh Nerlige Ramappa
  2022-08-05  9:45 ` Tvrtko Ursulin
                   ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-08-04 23:21 UTC (permalink / raw)
  To: intel-gfx, Tvrtko Ursulin

From: John Harrison <John.C.Harrison@Intel.com>

GuC provides engine_id and last_switch_in ticks for an active context in
the pphwsp. The context image provides a 32 bit total ticks which is the
accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
information is used to calculate the context busyness as follows:

If the engine_id is valid, then busyness is the sum of accumulated total
ticks and active ticks. Active ticks is calculated with current gt time
as reference.

If engine_id is invalid, busyness is equal to accumulated total ticks.

Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
potential race was highlighted in an earlier review that can lead to
double accounting of busyness. While the solution to this is a wip,
busyness is still usable for platforms running GuC submission.

Remaining work: Enable and test context busyness for
virtual_parent_context_ops and virtual_child_context_ops.

v2: (Tvrtko)
- Use COPS_RUNTIME_ACTIVE_TOTAL
- Add code comment for the race
- Undo local variables initializations

v3:
- Add support for virtual engines based on
  https://patchwork.freedesktop.org/series/105227/

v4:
- Update commit message with remaining work.
- Rebase

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
 drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65 ++++++++++++++++++-
 drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
 6 files changed, 89 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 654a092ed3d6..e2d70a9fdac0 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
 	child->parallel.parent = parent;
 }
 
-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
 {
 	u64 total, active;
 
+	if (ce->ops->update_stats)
+		ce->ops->update_stats(ce);
+
 	total = ce->stats.runtime.total;
 	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
 		total *= ce->engine->gt->clock_period_ns;
 
 	active = READ_ONCE(ce->stats.active);
-	if (active)
+	/*
+	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
+	 * already provides the total active time of the context, so skip this
+	 * calculation when this flag is set.
+	 */
+	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
 		active = intel_context_clock() - active;
 
 	return total + active;
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index 8e2d70630c49..3d1d7436c1a4 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -58,7 +58,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
 	return !!ce->parallel.number_children;
 }
 
-static inline bool intel_context_is_pinned(struct intel_context *ce);
+static inline bool intel_context_is_pinned(const struct intel_context *ce);
 
 static inline struct intel_context *
 intel_context_to_parent(struct intel_context *ce)
@@ -118,7 +118,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
  * Returns: true if the context is currently pinned for use by the GPU.
  */
 static inline bool
-intel_context_is_pinned(struct intel_context *ce)
+intel_context_is_pinned(const struct intel_context *ce)
 {
 	return atomic_read(&ce->pin_count);
 }
@@ -362,7 +362,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
 	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
 }
 
-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
 u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
 
 static inline u64 intel_context_clock(void)
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 04eacae1aca5..f7ff4c7d81c7 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -38,6 +38,9 @@ struct intel_context_ops {
 #define COPS_RUNTIME_CYCLES_BIT 1
 #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
 
+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
+
 	int (*alloc)(struct intel_context *ce);
 
 	void (*revoke)(struct intel_context *ce, struct i915_request *rq,
@@ -56,6 +59,8 @@ struct intel_context_ops {
 
 	void (*sched_disable)(struct intel_context *ce);
 
+	void (*update_stats)(struct intel_context *ce);
+
 	void (*reset)(struct intel_context *ce);
 	void (*destroy)(struct kref *kref);
 
@@ -148,6 +153,7 @@ struct intel_context {
 			struct ewma_runtime avg;
 			u64 total;
 			u32 last;
+			u64 start_gt_clk;
 			I915_SELFTEST_DECLARE(u32 num_underflow);
 			I915_SELFTEST_DECLARE(u32 max_underflow);
 		} runtime;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index 323b055e5db9..c7b54f1631b9 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
 	return guc_class_engine_class_map[guc_class];
 }
 
+/* Per context engine usage stats: */
+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
+
 /* Work item for submitting workloads into work queue of GuC. */
 struct guc_wq_item {
 	u32 header;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 0d17da77e787..c9fefa254a7e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
 	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
 }
 
-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
+static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
 {
 	return &ce->engine->gt->uc.guc;
 }
@@ -1376,13 +1376,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
 	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
 }
 
+static void __guc_context_update_clks(struct intel_context *ce);
 static void guc_timestamp_ping(struct work_struct *wrk)
 {
 	struct intel_guc *guc = container_of(wrk, typeof(*guc),
 					     timestamp.work.work);
 	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
 	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_context *ce;
 	intel_wakeref_t wakeref;
+	unsigned long index;
 	int srcu, ret;
 
 	/*
@@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
 		__update_guc_busyness_stats(guc);
 
+	/* adjust context stats for overflow */
+	xa_for_each(&guc->context_lookup, index, ce)
+		__guc_context_update_clks(ce);
+
 	intel_gt_reset_unlock(gt, srcu);
 
 	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
@@ -1469,6 +1476,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
 			 guc->timestamp.ping_delay);
 }
 
+static void __guc_context_update_clks(struct intel_context *ce)
+{
+	struct intel_guc *guc = ce_to_guc(ce);
+	struct intel_gt *gt = ce->engine->gt;
+	u32 *pphwsp, last_switch, engine_id;
+	u64 start_gt_clk, active;
+	unsigned long flags;
+	ktime_t unused;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	/*
+	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
+	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
+	 * relies on GuC and GPU for busyness calculations. Due to this, A
+	 * potential race was highlighted in an earlier review that can lead to
+	 * double accounting of busyness. While the solution to this is a wip,
+	 * busyness is still usable for platforms running GuC submission.
+	 */
+	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
+	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
+	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
+
+	guc_update_pm_timestamp(guc, &unused);
+
+	if (engine_id != 0xffffffff && last_switch) {
+		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
+		__extend_last_switch(guc, &start_gt_clk, last_switch);
+		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
+		WRITE_ONCE(ce->stats.active, active);
+	} else {
+		lrc_update_runtime(ce);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void guc_context_update_stats(struct intel_context *ce)
+{
+	if (!intel_context_pin_if_active(ce)) {
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
+		WRITE_ONCE(ce->stats.active, 0);
+		return;
+	}
+
+	__guc_context_update_clks(ce);
+	intel_context_unpin(ce);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -2723,6 +2780,7 @@ static void guc_context_unpin(struct intel_context *ce)
 {
 	struct intel_guc *guc = ce_to_guc(ce);
 
+	lrc_update_runtime(ce);
 	unpin_guc_id(guc, ce);
 	lrc_unpin(ce);
 
@@ -3344,6 +3402,7 @@ static void remove_from_context(struct i915_request *rq)
 }
 
 static const struct intel_context_ops guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
 	.alloc = guc_context_alloc,
 
 	.pre_pin = guc_context_pre_pin,
@@ -3360,6 +3419,8 @@ static const struct intel_context_ops guc_context_ops = {
 
 	.sched_disable = guc_context_sched_disable,
 
+	.update_stats = guc_context_update_stats,
+
 	.reset = lrc_reset,
 	.destroy = guc_context_destroy,
 
@@ -3593,6 +3654,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
 }
 
 static const struct intel_context_ops virtual_guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
 	.alloc = guc_virtual_context_alloc,
 
 	.pre_pin = guc_virtual_context_pre_pin,
@@ -3608,6 +3670,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
 	.exit = guc_virtual_context_exit,
 
 	.sched_disable = guc_context_sched_disable,
+	.update_stats = guc_context_update_stats,
 
 	.destroy = guc_context_destroy,
 
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
index b09d1d386574..8d81119fff14 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -147,11 +147,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
 		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
 	seq_printf(m, "drm-client-id:\t%u\n", client->id);
 
-	/*
-	 * Temporarily skip showing client engine information with GuC submission till
-	 * fetching engine busyness is implemented in the GuC submission backend
-	 */
-	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
+	if (GRAPHICS_VER(i915) < 8)
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
-- 
2.37.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
@ 2022-06-16 19:08 Nerlige Ramappa, Umesh
  0 siblings, 0 replies; 29+ messages in thread
From: Nerlige Ramappa, Umesh @ 2022-06-16 19:08 UTC (permalink / raw)
  To: intel-gfx

From: John Harrison <John.C.Harrison@Intel.com>

GuC provides engine_id and last_switch_in ticks for an active context in
the pphwsp. The context image provides a 32 bit total ticks which is the
accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
information is used to calculate the context busyness as follows:

If the engine_id is valid, then busyness is the sum of accumulated total
ticks and active ticks. Active ticks is calculated with current gt time
as reference.

If engine_id is invalid, busyness is equal to accumulated total ticks.

Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
potential race was highlighted in an earlier review that can lead to
double accounting of busyness. While the solution to this is a wip,
busyness is still usable for platforms running GuC submission.

v2: (Tvrtko)
- Use COPS_RUNTIME_ACTIVE_TOTAL
- Add code comment for the race
- Undo local variables initializations

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++-
 drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  6 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 63 ++++++++++++++++++-
 drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
 6 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 4070cb5711d8..4a84146710e0 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
 	child->parallel.parent = parent;
 }
 
-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
 {
 	u64 total, active;
 
+	if (ce->ops->update_stats)
+		ce->ops->update_stats(ce);
+
 	total = ce->stats.runtime.total;
 	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
 		total *= ce->engine->gt->clock_period_ns;
 
 	active = READ_ONCE(ce->stats.active);
-	if (active)
+	/*
+	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
+	 * already provides the total active time of the context, so skip this
+	 * calculation when this flag is set.
+	 */
+	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
 		active = intel_context_clock() - active;
 
 	return total + active;
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index b7d3214d2cdd..5fc7c19ab29b 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
 	return !!ce->parallel.number_children;
 }
 
-static inline bool intel_context_is_pinned(struct intel_context *ce);
+static inline bool intel_context_is_pinned(const struct intel_context *ce);
 
 static inline struct intel_context *
 intel_context_to_parent(struct intel_context *ce)
@@ -116,7 +116,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
  * Returns: true if the context is currently pinned for use by the GPU.
  */
 static inline bool
-intel_context_is_pinned(struct intel_context *ce)
+intel_context_is_pinned(const struct intel_context *ce)
 {
 	return atomic_read(&ce->pin_count);
 }
@@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
 	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
 }
 
-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
 u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
 
 static inline u64 intel_context_clock(void)
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 09f82545789f..797bb4242c18 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -38,6 +38,9 @@ struct intel_context_ops {
 #define COPS_RUNTIME_CYCLES_BIT 1
 #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
 
+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
+
 	int (*alloc)(struct intel_context *ce);
 
 	void (*ban)(struct intel_context *ce, struct i915_request *rq);
@@ -55,6 +58,8 @@ struct intel_context_ops {
 
 	void (*sched_disable)(struct intel_context *ce);
 
+	void (*update_stats)(struct intel_context *ce);
+
 	void (*reset)(struct intel_context *ce);
 	void (*destroy)(struct kref *kref);
 
@@ -146,6 +151,7 @@ struct intel_context {
 			struct ewma_runtime avg;
 			u64 total;
 			u32 last;
+			u64 start_gt_clk;
 			I915_SELFTEST_DECLARE(u32 num_underflow);
 			I915_SELFTEST_DECLARE(u32 max_underflow);
 		} runtime;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index b3c9a9327f76..6231ad03e4eb 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
 	return guc_class_engine_class_map[guc_class];
 }
 
+/* Per context engine usage stats: */
+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
+
 /* Work item for submitting workloads into work queue of GuC. */
 struct guc_wq_item {
 	u32 header;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 5a1dfacf24ea..d70a28582049 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
 	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
 }
 
-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
+static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
 {
 	return &ce->engine->gt->uc.guc;
 }
@@ -1323,13 +1323,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
 	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
 }
 
+static void __guc_context_update_clks(struct intel_context *ce);
 static void guc_timestamp_ping(struct work_struct *wrk)
 {
 	struct intel_guc *guc = container_of(wrk, typeof(*guc),
 					     timestamp.work.work);
 	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
 	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_context *ce;
 	intel_wakeref_t wakeref;
+	unsigned long index;
 	int srcu, ret;
 
 	/*
@@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
 		__update_guc_busyness_stats(guc);
 
+	/* adjust context stats for overflow */
+	xa_for_each(&guc->context_lookup, index, ce)
+		__guc_context_update_clks(ce);
+
 	intel_gt_reset_unlock(gt, srcu);
 
 	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
@@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
 			 guc->timestamp.ping_delay);
 }
 
+static void __guc_context_update_clks(struct intel_context *ce)
+{
+	struct intel_guc *guc = ce_to_guc(ce);
+	struct intel_gt *gt = ce->engine->gt;
+	u32 *pphwsp, last_switch, engine_id;
+	u64 start_gt_clk, active;
+	unsigned long flags;
+	ktime_t unused;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	/*
+	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
+	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
+	 * relies on GuC and GPU for busyness calculations. Due to this, A
+	 * potential race was highlighted in an earlier review that can lead to
+	 * double accounting of busyness. While the solution to this is a wip,
+	 * busyness is still usable for platforms running GuC submission.
+	 */
+	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
+	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
+	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
+
+	guc_update_pm_timestamp(guc, &unused);
+
+	if (engine_id != 0xffffffff && last_switch) {
+		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
+		__extend_last_switch(guc, &start_gt_clk, last_switch);
+		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
+		WRITE_ONCE(ce->stats.active, active);
+	} else {
+		lrc_update_runtime(ce);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void guc_context_update_stats(struct intel_context *ce)
+{
+	if (!intel_context_pin_if_active(ce)) {
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
+		WRITE_ONCE(ce->stats.active, 0);
+		return;
+	}
+
+	__guc_context_update_clks(ce);
+	intel_context_unpin(ce);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct intel_context *ce)
 {
 	struct intel_guc *guc = ce_to_guc(ce);
 
+	lrc_update_runtime(ce);
 	unpin_guc_id(guc, ce);
 	lrc_unpin(ce);
 
@@ -3183,6 +3241,7 @@ static void remove_from_context(struct i915_request *rq)
 }
 
 static const struct intel_context_ops guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
 	.alloc = guc_context_alloc,
 
 	.pre_pin = guc_context_pre_pin,
@@ -3199,6 +3258,8 @@ static const struct intel_context_ops guc_context_ops = {
 
 	.sched_disable = guc_context_sched_disable,
 
+	.update_stats = guc_context_update_stats,
+
 	.reset = lrc_reset,
 	.destroy = guc_context_destroy,
 
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
index 18d38cb59923..118db6f03f15 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -146,11 +146,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
 		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
 	seq_printf(m, "drm-client-id:\t%u\n", client->id);
 
-	/*
-	 * Temporarily skip showing client engine information with GuC submission till
-	 * fetching engine busyness is implemented in the GuC submission backend
-	 */
-	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
+	if (GRAPHICS_VER(i915) < 8)
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-06-15  7:08     ` Tvrtko Ursulin
@ 2022-06-15 17:42       ` Umesh Nerlige Ramappa
  2022-08-25  6:18         ` Dixit, Ashutosh
  0 siblings, 1 reply; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-06-15 17:42 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Wed, Jun 15, 2022 at 08:08:40AM +0100, Tvrtko Ursulin wrote:
>
>On 14/06/2022 17:32, Umesh Nerlige Ramappa wrote:
>>On Tue, Jun 14, 2022 at 02:30:42PM +0100, Tvrtko Ursulin wrote:
>>>
>>>On 14/06/2022 01:46, Nerlige Ramappa, Umesh wrote:
>>>>From: John Harrison <John.C.Harrison@Intel.com>
>>>>
>>>>GuC provides engine_id and last_switch_in ticks for an active 
>>>>context in the
>>>>pphwsp. The context image provides a 32 bit total ticks which is 
>>>>the accumulated
>>>>by the context (a.k.a. context[CTX_TIMESTAMP]). This information 
>>>>is used to
>>>>calculate the context busyness as follows:
>>>>
>>>>If the engine_id is valid, then busyness is the sum of 
>>>>accumulated total ticks
>>>>and active ticks. Active ticks is calculated with current gt 
>>>>time as reference.
>>>>
>>>>If engine_id is invalid, busyness is equal to accumulated total ticks.
>>>>
>>>>Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
>>>>potential race was highlighted in an earlier review that can 
>>>>lead to double
>>>>accounting of busyness. While the solution to this is a wip, 
>>>>busyness is still
>>>>usable for platforms running GuC submission.
>>>>
>>>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>>---
>>>> drivers/gpu/drm/i915/gt/intel_context.c       | 11 +++-
>>>> drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>>> drivers/gpu/drm/i915/gt/intel_context_types.h |  3 +
>>>> drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>>> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 55 ++++++++++++++++++-
>>>> drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>>> 6 files changed, 75 insertions(+), 11 deletions(-)
>>>>
>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>>>b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>index 4070cb5711d8..a49f313db911 100644
>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.c
>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>@@ -576,16 +576,23 @@ void 
>>>>intel_context_bind_parent_child(struct intel_context *parent,
>>>>     child->parallel.parent = parent;
>>>> }
>>>>-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>> {
>>>>     u64 total, active;
>>>>+    if (ce->ops->update_stats)
>>>>+        ce->ops->update_stats(ce);
>>>>+
>>>>     total = ce->stats.runtime.total;
>>>>     if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>>         total *= ce->engine->gt->clock_period_ns;
>>>>     active = READ_ONCE(ce->stats.active);
>>>>-    if (active)
>>>>+    /*
>>>>+     * GuC backend returns the actual time the context was 
>>>>active, so skip
>>>>+     * the calculation here for GuC.
>>>>+     */
>>>>+    if (active && !intel_engine_uses_guc(ce->engine))
>>>
>>>What is the point of looking at ce->stats.active in GuC mode? I 
>>>see that guc_context_update_stats/__guc_context_update_clks 
>>>touches it, but I can't spot that there is a purpose to it. This 
>>>is the only conditional reading it but it is short-circuited in 
>>>GuC case.
>>>
>>>Also, since a GuC only vfunc (update_stats) has been added, I 
>>>wonder why not just fork the whole runtime query 
>>>(ce->get_total_runtime_ns). I think that would end up cleaner.
>>>
>>>>         active = intel_context_clock() - active;
>>>>     return total + active;
>>
>>In case of GuC the active is used directly here since the active 
>>updated in update_stats is equal to the active time of the context 
>>already. I will look into separate vfunc.
>
>Ah right, I misread something. But yes, I think a separate vfunc will 
>look cleaner. Another option (instead of vfunc) is a similar flag to 
>control the express the flavour of active?

Flag does sound simpler. The guc context ops can have something like 
COPS_RUNTIME_ACTIVE_TOTAL that means total active time.

>
>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>>>b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>index b7d3214d2cdd..5fc7c19ab29b 100644
>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.h
>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>@@ -56,7 +56,7 @@ static inline bool 
>>>>intel_context_is_parent(struct intel_context *ce)
>>>>     return !!ce->parallel.number_children;
>>>> }
>>
>>snip
>>
>>>>+static void __guc_context_update_clks(struct intel_context *ce)
>>>>+{
>>>>+    struct intel_guc *guc = ce_to_guc(ce);
>>>>+    struct intel_gt *gt = ce->engine->gt;
>>>>+    u32 *pphwsp, last_switch, engine_id;
>>>>+    u64 start_gt_clk = 0, active = 0;
>>>
>>>No need to init these two.
>>>
>>>>+    unsigned long flags;
>>>>+    ktime_t unused;
>>>>+
>>>>+    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>>>+
>>>>+    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>>>+    last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>>>+    engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>>>+
>>>>+    guc_update_pm_timestamp(guc, &unused);
>>>>+
>>>>+    if (engine_id != 0xffffffff && last_switch) {
>>>>+        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>>>+        __extend_last_switch(guc, &start_gt_clk, last_switch);
>>>>+        active = intel_gt_clock_interval_to_ns(gt, 
>>>>guc->timestamp.gt_stamp - start_gt_clk);
>>>>+        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>>>+        WRITE_ONCE(ce->stats.active, active);
>>>>+    } else {
>>>>+        lrc_update_runtime(ce);
>>>
>>>Why is this called from here? Presumably it was called already 
>>>from guc_context_unpin if here code things context is not active. 
>>>Or will be called shortly, once context save is done.
>>
>>guc_context_unpin is only called in the path of ce->sched_disable. 
>>The sched_disable is implemented in GuC (H2G message). Once the 
>>corresponding G2H response is received, the context is actually 
>>unpinned, eventually calling guc_context_unpin. Also the context may 
>>not necessarily be disabled after each context exit.
>
>So if I understand correctly, lrc runtime is only updated if someone 
>is reading the busyness and not as part of normal context state 
>transitions?

If you mean context_in/out events (like csb interrupts), only GuC can 
see those events. KMD has no visibility into that. These 3 paths call 
lrc_update_runtime.

user query: (engine_id != 0xffffffff && last_switch) translates to GuC 
being within context_in and context_out events, so updating it outside 
of this window is one way to report the correct busyness.

worker: guc_timestamp_ping()  also updates context stats (infrequently) 
for all contexts primarily to take care of overflows.

context unpin: Existing code calls lrc_update_runtime only when 
unpinning the context which takes care of accumulating busyness when 
requests are retired.

Thanks,
Umesh

>
>Regards,
>
>Tvrtko

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-06-14 16:32   ` Umesh Nerlige Ramappa
@ 2022-06-15  7:08     ` Tvrtko Ursulin
  2022-06-15 17:42       ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 29+ messages in thread
From: Tvrtko Ursulin @ 2022-06-15  7:08 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx


On 14/06/2022 17:32, Umesh Nerlige Ramappa wrote:
> On Tue, Jun 14, 2022 at 02:30:42PM +0100, Tvrtko Ursulin wrote:
>>
>> On 14/06/2022 01:46, Nerlige Ramappa, Umesh wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> GuC provides engine_id and last_switch_in ticks for an active context 
>>> in the
>>> pphwsp. The context image provides a 32 bit total ticks which is the 
>>> accumulated
>>> by the context (a.k.a. context[CTX_TIMESTAMP]). This information is 
>>> used to
>>> calculate the context busyness as follows:
>>>
>>> If the engine_id is valid, then busyness is the sum of accumulated 
>>> total ticks
>>> and active ticks. Active ticks is calculated with current gt time as 
>>> reference.
>>>
>>> If engine_id is invalid, busyness is equal to accumulated total ticks.
>>>
>>> Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
>>> potential race was highlighted in an earlier review that can lead to 
>>> double
>>> accounting of busyness. While the solution to this is a wip, busyness 
>>> is still
>>> usable for platforms running GuC submission.
>>>
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/gt/intel_context.c       | 11 +++-
>>>  drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>>  drivers/gpu/drm/i915/gt/intel_context_types.h |  3 +
>>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 55 ++++++++++++++++++-
>>>  drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>>  6 files changed, 75 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>> b/drivers/gpu/drm/i915/gt/intel_context.c
>>> index 4070cb5711d8..a49f313db911 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>> @@ -576,16 +576,23 @@ void intel_context_bind_parent_child(struct 
>>> intel_context *parent,
>>>      child->parallel.parent = parent;
>>>  }
>>> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>>> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>  {
>>>      u64 total, active;
>>> +    if (ce->ops->update_stats)
>>> +        ce->ops->update_stats(ce);
>>> +
>>>      total = ce->stats.runtime.total;
>>>      if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>          total *= ce->engine->gt->clock_period_ns;
>>>      active = READ_ONCE(ce->stats.active);
>>> -    if (active)
>>> +    /*
>>> +     * GuC backend returns the actual time the context was active, 
>>> so skip
>>> +     * the calculation here for GuC.
>>> +     */
>>> +    if (active && !intel_engine_uses_guc(ce->engine))
>>
>> What is the point of looking at ce->stats.active in GuC mode? I see 
>> that guc_context_update_stats/__guc_context_update_clks touches it, 
>> but I can't spot that there is a purpose to it. This is the only 
>> conditional reading it but it is short-circuited in GuC case.
>>
>> Also, since a GuC only vfunc (update_stats) has been added, I wonder 
>> why not just fork the whole runtime query (ce->get_total_runtime_ns). 
>> I think that would end up cleaner.
>>
>>>          active = intel_context_clock() - active;
>>>      return total + active;
> 
> In case of GuC the active is used directly here since the active updated 
> in update_stats is equal to the active time of the context already. I 
> will look into separate vfunc.

Ah right, I misread something. But yes, I think a separate vfunc will 
look cleaner. Another option (instead of vfunc) is a similar flag to 
control the express the flavour of active?

>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>> b/drivers/gpu/drm/i915/gt/intel_context.h
>>> index b7d3214d2cdd..5fc7c19ab29b 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>> @@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct 
>>> intel_context *ce)
>>>      return !!ce->parallel.number_children;
>>>  }
> 
> snip
> 
>>> +static void __guc_context_update_clks(struct intel_context *ce)
>>> +{
>>> +    struct intel_guc *guc = ce_to_guc(ce);
>>> +    struct intel_gt *gt = ce->engine->gt;
>>> +    u32 *pphwsp, last_switch, engine_id;
>>> +    u64 start_gt_clk = 0, active = 0;
>>
>> No need to init these two.
>>
>>> +    unsigned long flags;
>>> +    ktime_t unused;
>>> +
>>> +    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>> +
>>> +    pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>> +    last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>> +    engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>> +
>>> +    guc_update_pm_timestamp(guc, &unused);
>>> +
>>> +    if (engine_id != 0xffffffff && last_switch) {
>>> +        start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>> +        __extend_last_switch(guc, &start_gt_clk, last_switch);
>>> +        active = intel_gt_clock_interval_to_ns(gt, 
>>> guc->timestamp.gt_stamp - start_gt_clk);
>>> +        WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>> +        WRITE_ONCE(ce->stats.active, active);
>>> +    } else {
>>> +        lrc_update_runtime(ce);
>>
>> Why is this called from here? Presumably it was called already from 
>> guc_context_unpin if here code things context is not active. Or will 
>> be called shortly, once context save is done.
> 
> guc_context_unpin is only called in the path of ce->sched_disable. The 
> sched_disable is implemented in GuC (H2G message). Once the 
> corresponding G2H response is received, the context is actually 
> unpinned, eventually calling guc_context_unpin. Also the context may not 
> necessarily be disabled after each context exit.

So if I understand correctly, lrc runtime is only updated if someone is 
reading the busyness and not as part of normal context state transitions?

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-06-14 13:30 ` Tvrtko Ursulin
@ 2022-06-14 16:32   ` Umesh Nerlige Ramappa
  2022-06-15  7:08     ` Tvrtko Ursulin
  0 siblings, 1 reply; 29+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-06-14 16:32 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Tue, Jun 14, 2022 at 02:30:42PM +0100, Tvrtko Ursulin wrote:
>
>On 14/06/2022 01:46, Nerlige Ramappa, Umesh wrote:
>>From: John Harrison <John.C.Harrison@Intel.com>
>>
>>GuC provides engine_id and last_switch_in ticks for an active context in the
>>pphwsp. The context image provides a 32 bit total ticks which is the accumulated
>>by the context (a.k.a. context[CTX_TIMESTAMP]). This information is used to
>>calculate the context busyness as follows:
>>
>>If the engine_id is valid, then busyness is the sum of accumulated total ticks
>>and active ticks. Active ticks is calculated with current gt time as reference.
>>
>>If engine_id is invalid, busyness is equal to accumulated total ticks.
>>
>>Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
>>potential race was highlighted in an earlier review that can lead to double
>>accounting of busyness. While the solution to this is a wip, busyness is still
>>usable for platforms running GuC submission.
>>
>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>---
>>  drivers/gpu/drm/i915/gt/intel_context.c       | 11 +++-
>>  drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>>  drivers/gpu/drm/i915/gt/intel_context_types.h |  3 +
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 55 ++++++++++++++++++-
>>  drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>>  6 files changed, 75 insertions(+), 11 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
>>index 4070cb5711d8..a49f313db911 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_context.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>@@ -576,16 +576,23 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>>  	child->parallel.parent = parent;
>>  }
>>-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>  {
>>  	u64 total, active;
>>+	if (ce->ops->update_stats)
>>+		ce->ops->update_stats(ce);
>>+
>>  	total = ce->stats.runtime.total;
>>  	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>  		total *= ce->engine->gt->clock_period_ns;
>>  	active = READ_ONCE(ce->stats.active);
>>-	if (active)
>>+	/*
>>+	 * GuC backend returns the actual time the context was active, so skip
>>+	 * the calculation here for GuC.
>>+	 */
>>+	if (active && !intel_engine_uses_guc(ce->engine))
>
>What is the point of looking at ce->stats.active in GuC mode? I see 
>that guc_context_update_stats/__guc_context_update_clks touches it, 
>but I can't spot that there is a purpose to it. This is the only 
>conditional reading it but it is short-circuited in GuC case.
>
>Also, since a GuC only vfunc (update_stats) has been added, I wonder 
>why not just fork the whole runtime query (ce->get_total_runtime_ns). 
>I think that would end up cleaner.
>
>>  		active = intel_context_clock() - active;
>>  	return total + active;

In case of GuC the active is used directly here since the active updated 
in update_stats is equal to the active time of the context already. I 
will look into separate vfunc.

>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
>>index b7d3214d2cdd..5fc7c19ab29b 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_context.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>@@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
>>  	return !!ce->parallel.number_children;
>>  }

snip

>>+static void __guc_context_update_clks(struct intel_context *ce)
>>+{
>>+	struct intel_guc *guc = ce_to_guc(ce);
>>+	struct intel_gt *gt = ce->engine->gt;
>>+	u32 *pphwsp, last_switch, engine_id;
>>+	u64 start_gt_clk = 0, active = 0;
>
>No need to init these two.
>
>>+	unsigned long flags;
>>+	ktime_t unused;
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>+	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>+	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>+
>>+	guc_update_pm_timestamp(guc, &unused);
>>+
>>+	if (engine_id != 0xffffffff && last_switch) {
>>+		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>+		__extend_last_switch(guc, &start_gt_clk, last_switch);
>>+		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
>>+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>+		WRITE_ONCE(ce->stats.active, active);
>>+	} else {
>>+		lrc_update_runtime(ce);
>
>Why is this called from here? Presumably it was called already from 
>guc_context_unpin if here code things context is not active. Or will 
>be called shortly, once context save is done.

guc_context_unpin is only called in the path of ce->sched_disable. The 
sched_disable is implemented in GuC (H2G message). Once the 
corresponding G2H response is received, the context is actually 
unpinned, eventually calling guc_context_unpin. Also the context may not 
necessarily be disabled after each context exit.

>
>Also, some comments here describing the double accounting race would 
>be good. Or if there are some already in the physical engine code just 
>reference that function.

Will do,

Thanks,
Umesh
>
>Regards,
>
>Tvrtko
>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
  2022-06-14  0:46 Nerlige Ramappa, Umesh
@ 2022-06-14 13:30 ` Tvrtko Ursulin
  2022-06-14 16:32   ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 29+ messages in thread
From: Tvrtko Ursulin @ 2022-06-14 13:30 UTC (permalink / raw)
  To: Nerlige Ramappa, Umesh, intel-gfx, John.C.Harrison


On 14/06/2022 01:46, Nerlige Ramappa, Umesh wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> GuC provides engine_id and last_switch_in ticks for an active context in the
> pphwsp. The context image provides a 32 bit total ticks which is the accumulated
> by the context (a.k.a. context[CTX_TIMESTAMP]). This information is used to
> calculate the context busyness as follows:
> 
> If the engine_id is valid, then busyness is the sum of accumulated total ticks
> and active ticks. Active ticks is calculated with current gt time as reference.
> 
> If engine_id is invalid, busyness is equal to accumulated total ticks.
> 
> Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
> potential race was highlighted in an earlier review that can lead to double
> accounting of busyness. While the solution to this is a wip, busyness is still
> usable for platforms running GuC submission.
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_context.c       | 11 +++-
>   drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
>   drivers/gpu/drm/i915/gt/intel_context_types.h |  3 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 55 ++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
>   6 files changed, 75 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index 4070cb5711d8..a49f313db911 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -576,16 +576,23 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>   	child->parallel.parent = parent;
>   }
>   
> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>   {
>   	u64 total, active;
>   
> +	if (ce->ops->update_stats)
> +		ce->ops->update_stats(ce);
> +
>   	total = ce->stats.runtime.total;
>   	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>   		total *= ce->engine->gt->clock_period_ns;
>   
>   	active = READ_ONCE(ce->stats.active);
> -	if (active)
> +	/*
> +	 * GuC backend returns the actual time the context was active, so skip
> +	 * the calculation here for GuC.
> +	 */
> +	if (active && !intel_engine_uses_guc(ce->engine))

What is the point of looking at ce->stats.active in GuC mode? I see that 
guc_context_update_stats/__guc_context_update_clks touches it, but I 
can't spot that there is a purpose to it. This is the only conditional 
reading it but it is short-circuited in GuC case.

Also, since a GuC only vfunc (update_stats) has been added, I wonder why 
not just fork the whole runtime query (ce->get_total_runtime_ns). I 
think that would end up cleaner.

>   		active = intel_context_clock() - active;
>   
>   	return total + active;
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index b7d3214d2cdd..5fc7c19ab29b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
>   	return !!ce->parallel.number_children;
>   }
>   
> -static inline bool intel_context_is_pinned(struct intel_context *ce);
> +static inline bool intel_context_is_pinned(const struct intel_context *ce);
>   
>   static inline struct intel_context *
>   intel_context_to_parent(struct intel_context *ce)
> @@ -116,7 +116,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
>    * Returns: true if the context is currently pinned for use by the GPU.
>    */
>   static inline bool
> -intel_context_is_pinned(struct intel_context *ce)
> +intel_context_is_pinned(const struct intel_context *ce)
>   {
>   	return atomic_read(&ce->pin_count);
>   }
> @@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
>   	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>   }
>   
> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>   u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>   
>   static inline u64 intel_context_clock(void)
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 09f82545789f..0a3290c99a31 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -55,6 +55,8 @@ struct intel_context_ops {
>   
>   	void (*sched_disable)(struct intel_context *ce);
>   
> +	void (*update_stats)(struct intel_context *ce);
> +
>   	void (*reset)(struct intel_context *ce);
>   	void (*destroy)(struct kref *kref);
>   
> @@ -146,6 +148,7 @@ struct intel_context {
>   			struct ewma_runtime avg;
>   			u64 total;
>   			u32 last;
> +			u64 start_gt_clk;
>   			I915_SELFTEST_DECLARE(u32 num_underflow);
>   			I915_SELFTEST_DECLARE(u32 max_underflow);
>   		} runtime;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index b3c9a9327f76..6231ad03e4eb 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
>   	return guc_class_engine_class_map[guc_class];
>   }
>   
> +/* Per context engine usage stats: */
> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
> +#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
> +#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
> +
>   /* Work item for submitting workloads into work queue of GuC. */
>   struct guc_wq_item {
>   	u32 header;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 5a1dfacf24ea..b86401144417 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
>   	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>   }
>   
> -static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
> +static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
>   {
>   	return &ce->engine->gt->uc.guc;
>   }
> @@ -1323,13 +1323,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
>   	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>   }
>   
> +static void __guc_context_update_clks(struct intel_context *ce);
>   static void guc_timestamp_ping(struct work_struct *wrk)
>   {
>   	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>   					     timestamp.work.work);
>   	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>   	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_context *ce;
>   	intel_wakeref_t wakeref;
> +	unsigned long index;
>   	int srcu, ret;
>   
>   	/*
> @@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
>   	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>   		__update_guc_busyness_stats(guc);
>   
> +	/* adjust context stats for overflow */
> +	xa_for_each(&guc->context_lookup, index, ce)
> +		__guc_context_update_clks(ce);
> +
>   	intel_gt_reset_unlock(gt, srcu);
>   
>   	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> @@ -1405,6 +1412,48 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
>   			 guc->timestamp.ping_delay);
>   }
>   
> +static void __guc_context_update_clks(struct intel_context *ce)
> +{
> +	struct intel_guc *guc = ce_to_guc(ce);
> +	struct intel_gt *gt = ce->engine->gt;
> +	u32 *pphwsp, last_switch, engine_id;
> +	u64 start_gt_clk = 0, active = 0;

No need to init these two.

> +	unsigned long flags;
> +	ktime_t unused;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
> +	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
> +	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
> +
> +	guc_update_pm_timestamp(guc, &unused);
> +
> +	if (engine_id != 0xffffffff && last_switch) {
> +		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
> +		__extend_last_switch(guc, &start_gt_clk, last_switch);
> +		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
> +		WRITE_ONCE(ce->stats.active, active);
> +	} else {
> +		lrc_update_runtime(ce);

Why is this called from here? Presumably it was called already from 
guc_context_unpin if here code things context is not active. Or will be 
called shortly, once context save is done.

Also, some comments here describing the double accounting race would be 
good. Or if there are some already in the physical engine code just 
reference that function.

Regards,

Tvrtko

> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void guc_context_update_stats(struct intel_context *ce)
> +{
> +	if (!intel_context_pin_if_active(ce)) {
> +		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
> +		WRITE_ONCE(ce->stats.active, 0);
> +		return;
> +	}
> +
> +	__guc_context_update_clks(ce);
> +	intel_context_unpin(ce);
> +}
> +
>   static inline bool
>   submission_disabled(struct intel_guc *guc)
>   {
> @@ -2585,6 +2634,7 @@ static void guc_context_unpin(struct intel_context *ce)
>   {
>   	struct intel_guc *guc = ce_to_guc(ce);
>   
> +	lrc_update_runtime(ce);
>   	unpin_guc_id(guc, ce);
>   	lrc_unpin(ce);
>   
> @@ -3183,6 +3233,7 @@ static void remove_from_context(struct i915_request *rq)
>   }
>   
>   static const struct intel_context_ops guc_context_ops = {
> +	.flags = COPS_RUNTIME_CYCLES,
>   	.alloc = guc_context_alloc,
>   
>   	.pre_pin = guc_context_pre_pin,
> @@ -3199,6 +3250,8 @@ static const struct intel_context_ops guc_context_ops = {
>   
>   	.sched_disable = guc_context_sched_disable,
>   
> +	.update_stats = guc_context_update_stats,
> +
>   	.reset = lrc_reset,
>   	.destroy = guc_context_destroy,
>   
> diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
> index 18d38cb59923..118db6f03f15 100644
> --- a/drivers/gpu/drm/i915/i915_drm_client.c
> +++ b/drivers/gpu/drm/i915/i915_drm_client.c
> @@ -146,11 +146,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
>   		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
>   	seq_printf(m, "drm-client-id:\t%u\n", client->id);
>   
> -	/*
> -	 * Temporarily skip showing client engine information with GuC submission till
> -	 * fetching engine busyness is implemented in the GuC submission backend
> -	 */
> -	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
> +	if (GRAPHICS_VER(i915) < 8)
>   		return;
>   
>   	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
@ 2022-06-14  0:46 Nerlige Ramappa, Umesh
  2022-06-14 13:30 ` Tvrtko Ursulin
  0 siblings, 1 reply; 29+ messages in thread
From: Nerlige Ramappa, Umesh @ 2022-06-14  0:46 UTC (permalink / raw)
  To: intel-gfx, John.C.Harrison

From: John Harrison <John.C.Harrison@Intel.com>

GuC provides engine_id and last_switch_in ticks for an active context in the
pphwsp. The context image provides a 32 bit total ticks which is the accumulated
by the context (a.k.a. context[CTX_TIMESTAMP]). This information is used to
calculate the context busyness as follows:

If the engine_id is valid, then busyness is the sum of accumulated total ticks
and active ticks. Active ticks is calculated with current gt time as reference.

If engine_id is invalid, busyness is equal to accumulated total ticks.

Since KMD (CPU) retrieves busyness data from 2 sources - GPU and GuC, a
potential race was highlighted in an earlier review that can lead to double
accounting of busyness. While the solution to this is a wip, busyness is still
usable for platforms running GuC submission.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_context.c       | 11 +++-
 drivers/gpu/drm/i915/gt/intel_context.h       |  6 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  3 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 55 ++++++++++++++++++-
 drivers/gpu/drm/i915/i915_drm_client.c        |  6 +-
 6 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 4070cb5711d8..a49f313db911 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -576,16 +576,23 @@ void intel_context_bind_parent_child(struct intel_context *parent,
 	child->parallel.parent = parent;
 }
 
-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
 {
 	u64 total, active;
 
+	if (ce->ops->update_stats)
+		ce->ops->update_stats(ce);
+
 	total = ce->stats.runtime.total;
 	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
 		total *= ce->engine->gt->clock_period_ns;
 
 	active = READ_ONCE(ce->stats.active);
-	if (active)
+	/*
+	 * GuC backend returns the actual time the context was active, so skip
+	 * the calculation here for GuC.
+	 */
+	if (active && !intel_engine_uses_guc(ce->engine))
 		active = intel_context_clock() - active;
 
 	return total + active;
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index b7d3214d2cdd..5fc7c19ab29b 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -56,7 +56,7 @@ static inline bool intel_context_is_parent(struct intel_context *ce)
 	return !!ce->parallel.number_children;
 }
 
-static inline bool intel_context_is_pinned(struct intel_context *ce);
+static inline bool intel_context_is_pinned(const struct intel_context *ce);
 
 static inline struct intel_context *
 intel_context_to_parent(struct intel_context *ce)
@@ -116,7 +116,7 @@ static inline int intel_context_lock_pinned(struct intel_context *ce)
  * Returns: true if the context is currently pinned for use by the GPU.
  */
 static inline bool
-intel_context_is_pinned(struct intel_context *ce)
+intel_context_is_pinned(const struct intel_context *ce)
 {
 	return atomic_read(&ce->pin_count);
 }
@@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
 	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
 }
 
-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
 u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
 
 static inline u64 intel_context_clock(void)
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 09f82545789f..0a3290c99a31 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -55,6 +55,8 @@ struct intel_context_ops {
 
 	void (*sched_disable)(struct intel_context *ce);
 
+	void (*update_stats)(struct intel_context *ce);
+
 	void (*reset)(struct intel_context *ce);
 	void (*destroy)(struct kref *kref);
 
@@ -146,6 +148,7 @@ struct intel_context {
 			struct ewma_runtime avg;
 			u64 total;
 			u32 last;
+			u64 start_gt_clk;
 			I915_SELFTEST_DECLARE(u32 num_underflow);
 			I915_SELFTEST_DECLARE(u32 max_underflow);
 		} runtime;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index b3c9a9327f76..6231ad03e4eb 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -196,6 +196,11 @@ static inline u8 guc_class_to_engine_class(u8 guc_class)
 	return guc_class_engine_class_map[guc_class];
 }
 
+/* Per context engine usage stats: */
+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO	(0x500 / sizeof(u32))
+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID	(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
+
 /* Work item for submitting workloads into work queue of GuC. */
 struct guc_wq_item {
 	u32 header;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 5a1dfacf24ea..b86401144417 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -378,7 +378,7 @@ static inline void set_context_guc_id_invalid(struct intel_context *ce)
 	ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
 }
 
-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
+static inline struct intel_guc *ce_to_guc(const struct intel_context *ce)
 {
 	return &ce->engine->gt->uc.guc;
 }
@@ -1323,13 +1323,16 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
 	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
 }
 
+static void __guc_context_update_clks(struct intel_context *ce);
 static void guc_timestamp_ping(struct work_struct *wrk)
 {
 	struct intel_guc *guc = container_of(wrk, typeof(*guc),
 					     timestamp.work.work);
 	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
 	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_context *ce;
 	intel_wakeref_t wakeref;
+	unsigned long index;
 	int srcu, ret;
 
 	/*
@@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
 		__update_guc_busyness_stats(guc);
 
+	/* adjust context stats for overflow */
+	xa_for_each(&guc->context_lookup, index, ce)
+		__guc_context_update_clks(ce);
+
 	intel_gt_reset_unlock(gt, srcu);
 
 	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
@@ -1405,6 +1412,48 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
 			 guc->timestamp.ping_delay);
 }
 
+static void __guc_context_update_clks(struct intel_context *ce)
+{
+	struct intel_guc *guc = ce_to_guc(ce);
+	struct intel_gt *gt = ce->engine->gt;
+	u32 *pphwsp, last_switch, engine_id;
+	u64 start_gt_clk = 0, active = 0;
+	unsigned long flags;
+	ktime_t unused;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
+	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
+	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
+
+	guc_update_pm_timestamp(guc, &unused);
+
+	if (engine_id != 0xffffffff && last_switch) {
+		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
+		__extend_last_switch(guc, &start_gt_clk, last_switch);
+		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
+		WRITE_ONCE(ce->stats.active, active);
+	} else {
+		lrc_update_runtime(ce);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void guc_context_update_stats(struct intel_context *ce)
+{
+	if (!intel_context_pin_if_active(ce)) {
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
+		WRITE_ONCE(ce->stats.active, 0);
+		return;
+	}
+
+	__guc_context_update_clks(ce);
+	intel_context_unpin(ce);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -2585,6 +2634,7 @@ static void guc_context_unpin(struct intel_context *ce)
 {
 	struct intel_guc *guc = ce_to_guc(ce);
 
+	lrc_update_runtime(ce);
 	unpin_guc_id(guc, ce);
 	lrc_unpin(ce);
 
@@ -3183,6 +3233,7 @@ static void remove_from_context(struct i915_request *rq)
 }
 
 static const struct intel_context_ops guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES,
 	.alloc = guc_context_alloc,
 
 	.pre_pin = guc_context_pre_pin,
@@ -3199,6 +3250,8 @@ static const struct intel_context_ops guc_context_ops = {
 
 	.sched_disable = guc_context_sched_disable,
 
+	.update_stats = guc_context_update_stats,
+
 	.reset = lrc_reset,
 	.destroy = guc_context_destroy,
 
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
index 18d38cb59923..118db6f03f15 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -146,11 +146,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
 		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
 	seq_printf(m, "drm-client-id:\t%u\n", client->id);
 
-	/*
-	 * Temporarily skip showing client engine information with GuC submission till
-	 * fetching engine busyness is implemented in the GuC submission backend
-	 */
-	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
+	if (GRAPHICS_VER(i915) < 8)
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2022-08-31 22:58 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-16 22:13 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Nerlige Ramappa, Umesh
2022-06-17  1:31 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for i915/pmu: Wire GuC backend to per-client busyness (rev3) Patchwork
2022-06-17  1:51 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2022-06-17  8:00 ` [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Tvrtko Ursulin
2022-07-27  6:01   ` Umesh Nerlige Ramappa
2022-07-27  8:48     ` Tvrtko Ursulin
2022-08-01 19:02       ` Umesh Nerlige Ramappa
2022-08-02  8:41         ` Tvrtko Ursulin
2022-08-02 23:38           ` Umesh Nerlige Ramappa
2022-08-04  1:21             ` Umesh Nerlige Ramappa
2022-08-04  7:25               ` Tvrtko Ursulin
2022-06-17 12:14 ` [Intel-gfx] ✗ Fi.CI.IGT: failure for i915/pmu: Wire GuC backend to per-client busyness (rev3) Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2022-08-04 23:21 [Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness Umesh Nerlige Ramappa
2022-08-05  9:45 ` Tvrtko Ursulin
2022-08-05 15:18   ` Umesh Nerlige Ramappa
     [not found]     ` <87fshl3yw0.wl-ashutosh.dixit@intel.com>
2022-08-26 15:44       ` Umesh Nerlige Ramappa
2022-08-25  5:03 ` Dixit, Ashutosh
2022-08-25 21:12   ` Dixit, Ashutosh
2022-08-26  1:44 ` Dixit, Ashutosh
2022-08-26 16:33   ` Umesh Nerlige Ramappa
2022-08-31 20:25     ` Dixit, Ashutosh
2022-08-31 22:57       ` Umesh Nerlige Ramappa
2022-06-16 19:08 Nerlige Ramappa, Umesh
2022-06-14  0:46 Nerlige Ramappa, Umesh
2022-06-14 13:30 ` Tvrtko Ursulin
2022-06-14 16:32   ` Umesh Nerlige Ramappa
2022-06-15  7:08     ` Tvrtko Ursulin
2022-06-15 17:42       ` Umesh Nerlige Ramappa
2022-08-25  6:18         ` Dixit, Ashutosh

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.