All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats
@ 2021-10-15 23:47 ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-15 23:47 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: john.c.harrison, Tvrtko Ursulin, daniel.vetter, Matthew Brost

In preparation for GuC pmu stats, add a name to the execlists stats
structure so that it can be differentiated from the GuC stats.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c    | 14 +++---
 drivers/gpu/drm/i915/gt/intel_engine_stats.h | 33 +++++++------
 drivers/gpu/drm/i915/gt/intel_engine_types.h | 52 +++++++++++---------
 3 files changed, 53 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 2ae57e4656a3..38436f4b5706 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -361,7 +361,7 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
 		DRIVER_CAPS(i915)->has_logical_contexts = true;
 
 	ewma__engine_latency_init(&engine->latency);
-	seqcount_init(&engine->stats.lock);
+	seqcount_init(&engine->stats.execlists.lock);
 
 	ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
 
@@ -1876,15 +1876,16 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
 					    ktime_t *now)
 {
-	ktime_t total = engine->stats.total;
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	ktime_t total = stats->total;
 
 	/*
 	 * If the engine is executing something at the moment
 	 * add it to the total.
 	 */
 	*now = ktime_get();
-	if (READ_ONCE(engine->stats.active))
-		total = ktime_add(total, ktime_sub(*now, engine->stats.start));
+	if (READ_ONCE(stats->active))
+		total = ktime_add(total, ktime_sub(*now, stats->start));
 
 	return total;
 }
@@ -1898,13 +1899,14 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
 {
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
 	unsigned int seq;
 	ktime_t total;
 
 	do {
-		seq = read_seqcount_begin(&engine->stats.lock);
+		seq = read_seqcount_begin(&stats->lock);
 		total = __intel_engine_get_busy_time(engine, now);
-	} while (read_seqcount_retry(&engine->stats.lock, seq));
+	} while (read_seqcount_retry(&stats->lock, seq));
 
 	return total;
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_stats.h b/drivers/gpu/drm/i915/gt/intel_engine_stats.h
index 24fbdd94351a..8e762d683e50 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_stats.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_stats.h
@@ -15,45 +15,46 @@
 
 static inline void intel_engine_context_in(struct intel_engine_cs *engine)
 {
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
 	unsigned long flags;
 
-	if (engine->stats.active) {
-		engine->stats.active++;
+	if (stats->active) {
+		stats->active++;
 		return;
 	}
 
 	/* The writer is serialised; but the pmu reader may be from hardirq */
 	local_irq_save(flags);
-	write_seqcount_begin(&engine->stats.lock);
+	write_seqcount_begin(&stats->lock);
 
-	engine->stats.start = ktime_get();
-	engine->stats.active++;
+	stats->start = ktime_get();
+	stats->active++;
 
-	write_seqcount_end(&engine->stats.lock);
+	write_seqcount_end(&stats->lock);
 	local_irq_restore(flags);
 
-	GEM_BUG_ON(!engine->stats.active);
+	GEM_BUG_ON(!stats->active);
 }
 
 static inline void intel_engine_context_out(struct intel_engine_cs *engine)
 {
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
 	unsigned long flags;
 
-	GEM_BUG_ON(!engine->stats.active);
-	if (engine->stats.active > 1) {
-		engine->stats.active--;
+	GEM_BUG_ON(!stats->active);
+	if (stats->active > 1) {
+		stats->active--;
 		return;
 	}
 
 	local_irq_save(flags);
-	write_seqcount_begin(&engine->stats.lock);
+	write_seqcount_begin(&stats->lock);
 
-	engine->stats.active--;
-	engine->stats.total =
-		ktime_add(engine->stats.total,
-			  ktime_sub(ktime_get(), engine->stats.start));
+	stats->active--;
+	stats->total = ktime_add(stats->total,
+				 ktime_sub(ktime_get(), stats->start));
 
-	write_seqcount_end(&engine->stats.lock);
+	write_seqcount_end(&stats->lock);
 	local_irq_restore(flags);
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 9167ce52487c..b820a2c1124e 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -257,6 +257,33 @@ struct intel_engine_execlists {
 
 #define INTEL_ENGINE_CS_MAX_NAME 8
 
+struct intel_engine_execlists_stats {
+	/**
+	 * @active: Number of contexts currently scheduled in.
+	 */
+	unsigned int active;
+
+	/**
+	 * @lock: Lock protecting the below fields.
+	 */
+	seqcount_t lock;
+
+	/**
+	 * @total: Total time this engine was busy.
+	 *
+	 * Accumulated time not counting the most recent block in cases where
+	 * engine is currently busy (active > 0).
+	 */
+	ktime_t total;
+
+	/**
+	 * @start: Timestamp of the last idle to active transition.
+	 *
+	 * Idle is defined as active == 0, active is active > 0.
+	 */
+	ktime_t start;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -481,30 +508,7 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 
 	struct {
-		/**
-		 * @active: Number of contexts currently scheduled in.
-		 */
-		unsigned int active;
-
-		/**
-		 * @lock: Lock protecting the below fields.
-		 */
-		seqcount_t lock;
-
-		/**
-		 * @total: Total time this engine was busy.
-		 *
-		 * Accumulated time not counting the most recent block in cases
-		 * where engine is currently busy (active > 0).
-		 */
-		ktime_t total;
-
-		/**
-		 * @start: Timestamp of the last idle to active transition.
-		 *
-		 * Idle is defined as active == 0, active is active > 0.
-		 */
-		ktime_t start;
+		struct intel_engine_execlists_stats execlists;
 
 		/**
 		 * @rps: Utilisation at last RPS sampling.
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [Intel-gfx] [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats
@ 2021-10-15 23:47 ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-15 23:47 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: john.c.harrison, Tvrtko Ursulin, daniel.vetter, Matthew Brost

In preparation for GuC pmu stats, add a name to the execlists stats
structure so that it can be differentiated from the GuC stats.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c    | 14 +++---
 drivers/gpu/drm/i915/gt/intel_engine_stats.h | 33 +++++++------
 drivers/gpu/drm/i915/gt/intel_engine_types.h | 52 +++++++++++---------
 3 files changed, 53 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 2ae57e4656a3..38436f4b5706 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -361,7 +361,7 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
 		DRIVER_CAPS(i915)->has_logical_contexts = true;
 
 	ewma__engine_latency_init(&engine->latency);
-	seqcount_init(&engine->stats.lock);
+	seqcount_init(&engine->stats.execlists.lock);
 
 	ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
 
@@ -1876,15 +1876,16 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
 					    ktime_t *now)
 {
-	ktime_t total = engine->stats.total;
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	ktime_t total = stats->total;
 
 	/*
 	 * If the engine is executing something at the moment
 	 * add it to the total.
 	 */
 	*now = ktime_get();
-	if (READ_ONCE(engine->stats.active))
-		total = ktime_add(total, ktime_sub(*now, engine->stats.start));
+	if (READ_ONCE(stats->active))
+		total = ktime_add(total, ktime_sub(*now, stats->start));
 
 	return total;
 }
@@ -1898,13 +1899,14 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
 {
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
 	unsigned int seq;
 	ktime_t total;
 
 	do {
-		seq = read_seqcount_begin(&engine->stats.lock);
+		seq = read_seqcount_begin(&stats->lock);
 		total = __intel_engine_get_busy_time(engine, now);
-	} while (read_seqcount_retry(&engine->stats.lock, seq));
+	} while (read_seqcount_retry(&stats->lock, seq));
 
 	return total;
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_stats.h b/drivers/gpu/drm/i915/gt/intel_engine_stats.h
index 24fbdd94351a..8e762d683e50 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_stats.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_stats.h
@@ -15,45 +15,46 @@
 
 static inline void intel_engine_context_in(struct intel_engine_cs *engine)
 {
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
 	unsigned long flags;
 
-	if (engine->stats.active) {
-		engine->stats.active++;
+	if (stats->active) {
+		stats->active++;
 		return;
 	}
 
 	/* The writer is serialised; but the pmu reader may be from hardirq */
 	local_irq_save(flags);
-	write_seqcount_begin(&engine->stats.lock);
+	write_seqcount_begin(&stats->lock);
 
-	engine->stats.start = ktime_get();
-	engine->stats.active++;
+	stats->start = ktime_get();
+	stats->active++;
 
-	write_seqcount_end(&engine->stats.lock);
+	write_seqcount_end(&stats->lock);
 	local_irq_restore(flags);
 
-	GEM_BUG_ON(!engine->stats.active);
+	GEM_BUG_ON(!stats->active);
 }
 
 static inline void intel_engine_context_out(struct intel_engine_cs *engine)
 {
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
 	unsigned long flags;
 
-	GEM_BUG_ON(!engine->stats.active);
-	if (engine->stats.active > 1) {
-		engine->stats.active--;
+	GEM_BUG_ON(!stats->active);
+	if (stats->active > 1) {
+		stats->active--;
 		return;
 	}
 
 	local_irq_save(flags);
-	write_seqcount_begin(&engine->stats.lock);
+	write_seqcount_begin(&stats->lock);
 
-	engine->stats.active--;
-	engine->stats.total =
-		ktime_add(engine->stats.total,
-			  ktime_sub(ktime_get(), engine->stats.start));
+	stats->active--;
+	stats->total = ktime_add(stats->total,
+				 ktime_sub(ktime_get(), stats->start));
 
-	write_seqcount_end(&engine->stats.lock);
+	write_seqcount_end(&stats->lock);
 	local_irq_restore(flags);
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 9167ce52487c..b820a2c1124e 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -257,6 +257,33 @@ struct intel_engine_execlists {
 
 #define INTEL_ENGINE_CS_MAX_NAME 8
 
+struct intel_engine_execlists_stats {
+	/**
+	 * @active: Number of contexts currently scheduled in.
+	 */
+	unsigned int active;
+
+	/**
+	 * @lock: Lock protecting the below fields.
+	 */
+	seqcount_t lock;
+
+	/**
+	 * @total: Total time this engine was busy.
+	 *
+	 * Accumulated time not counting the most recent block in cases where
+	 * engine is currently busy (active > 0).
+	 */
+	ktime_t total;
+
+	/**
+	 * @start: Timestamp of the last idle to active transition.
+	 *
+	 * Idle is defined as active == 0, active is active > 0.
+	 */
+	ktime_t start;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -481,30 +508,7 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 
 	struct {
-		/**
-		 * @active: Number of contexts currently scheduled in.
-		 */
-		unsigned int active;
-
-		/**
-		 * @lock: Lock protecting the below fields.
-		 */
-		seqcount_t lock;
-
-		/**
-		 * @total: Total time this engine was busy.
-		 *
-		 * Accumulated time not counting the most recent block in cases
-		 * where engine is currently busy (active > 0).
-		 */
-		ktime_t total;
-
-		/**
-		 * @start: Timestamp of the last idle to active transition.
-		 *
-		 * Idle is defined as active == 0, active is active > 0.
-		 */
-		ktime_t start;
+		struct intel_engine_execlists_stats execlists;
 
 		/**
 		 * @rps: Utilisation at last RPS sampling.
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-15 23:47 ` [Intel-gfx] " Umesh Nerlige Ramappa
@ 2021-10-15 23:47   ` Umesh Nerlige Ramappa
  -1 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-15 23:47 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: john.c.harrison, Tvrtko Ursulin, daniel.vetter, Matthew Brost

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

	engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically with
a period of 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once every 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e. kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
  called during i915 load. This ends up calling the guc busyness unpark
  hook and results in kickstarting an uninitialized worker. Let
  park/unpark hooks check if guc submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
  of that.

v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For guc mode of submission the engine busyness is derived from gt time
  domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
  longer and falls within the busyness tolerances in selftest.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c         |  15 +
 drivers/gpu/drm/i915/gt/intel_reset.h         |   1 +
 drivers/gpu/drm/i915/gt/selftest_engine_pm.c  |  21 +-
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 273 ++++++++++++++++++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h               |   2 +
 15 files changed, 449 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	intel_engine_print_breadcrumbs(engine, m);
 }
 
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-					    ktime_t *now)
-{
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	ktime_t total = stats->total;
-
-	/*
-	 * If the engine is executing something at the moment
-	 * add it to the total.
-	 */
-	*now = ktime_get();
-	if (READ_ONCE(stats->active))
-		total = ktime_add(total, ktime_sub(*now, stats->start));
-
-	return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine busyness
  * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
 {
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	unsigned int seq;
-	ktime_t total;
-
-	do {
-		seq = read_seqcount_begin(&stats->lock);
-		total = __intel_engine_get_busy_time(engine, now);
-	} while (read_seqcount_retry(&stats->lock, seq));
-
-	return total;
+	return engine->busyness(engine, now);
 }
 
 struct intel_context *
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index b820a2c1124e..9300c65d6675 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
 	ktime_t start;
 };
 
+struct intel_engine_guc_stats {
+	/**
+	 * @running: Active state of the engine when busyness was last sampled.
+	 */
+	bool running;
+
+	/**
+	 * @prev_total: Previous value of total runtime clock cycles.
+	 */
+	u32 prev_total;
+
+	/**
+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
+	 */
+	u64 total_gt_clks;
+
+	/**
+	 * @start_gt_clk: GT clock time of last idle to active transition.
+	 */
+	u64 start_gt_clk;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -459,6 +481,12 @@ struct intel_engine_cs {
 	void		(*add_active_request)(struct i915_request *rq);
 	void		(*remove_active_request)(struct i915_request *rq);
 
+	/*
+	 * Get engine busyness and the time at which the busyness was sampled.
+	 */
+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
+				    ktime_t *now);
+
 	struct intel_engine_execlists execlists;
 
 	/*
@@ -508,7 +536,10 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 
 	struct {
-		struct intel_engine_execlists_stats execlists;
+		union {
+			struct intel_engine_execlists_stats execlists;
+			struct intel_engine_guc_stats guc;
+		};
 
 		/**
 		 * @rps: Utilisation at last RPS sampling.
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 73a79c2acd3a..e8ffcf36f6f4 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
 	lrc_fini_wa_ctx(engine);
 }
 
+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
+					   ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	ktime_t total = stats->total;
+
+	/*
+	 * If the engine is executing something at the moment
+	 * add it to the total.
+	 */
+	*now = ktime_get();
+	if (READ_ONCE(stats->active))
+		total = ktime_add(total, ktime_sub(*now, stats->start));
+
+	return total;
+}
+
+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
+					 ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	unsigned int seq;
+	ktime_t total;
+
+	do {
+		seq = read_seqcount_begin(&stats->lock);
+		total = __execlists_engine_busyness(engine, now);
+	} while (read_seqcount_retry(&stats->lock, seq));
+
+	return total;
+}
+
 static void
 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
@@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_bb_start = gen8_emit_bb_start;
 	else
 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
+
+	engine->busyness = execlists_engine_busyness;
 }
 
 static void logical_ring_default_irqs(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 524eaf678790..b4a8594bc46c 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 	intel_rc6_unpark(&gt->rc6);
 	intel_rps_unpark(&gt->rps);
 	i915_pmu_gt_unparked(i915);
+	intel_guc_busyness_unpark(gt);
 
 	intel_gt_unpark_requests(gt);
 	runtime_begin(gt);
@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
 	runtime_end(gt);
 	intel_gt_park_requests(gt);
 
+	intel_guc_busyness_park(gt);
 	i915_vma_parked(gt);
 	i915_pmu_gt_parked(i915);
 	intel_rps_park(&gt->rps);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 91200c43951f..37b4e6b852a6 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1389,6 +1389,21 @@ void intel_gt_handle_error(struct intel_gt *gt,
 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
 }
 
+bool intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu)
+{
+	int reset_in_progress;
+
+	might_lock(&gt->reset.backoff_srcu);
+
+	rcu_read_lock();
+	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
+	if (!reset_in_progress)
+		*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
+	rcu_read_unlock();
+
+	return reset_in_progress;
+}
+
 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
 {
 	might_lock(&gt->reset.backoff_srcu);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
index adc734e67387..4f5f4c00c54f 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.h
+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
@@ -38,6 +38,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine,
 
 void __i915_request_reset(struct i915_request *rq, bool guilty);
 
+bool __must_check intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu);
 int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
 void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
 
diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
index 75569666105d..24358bef6691 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
 		struct i915_request *rq;
 		ktime_t de, dt;
 		ktime_t t[2];
+		u32 gt_stamp;
 
 		if (!intel_engine_supports_stats(engine))
 			continue;
@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
 		ENGINE_TRACE(engine, "measuring idle time\n");
 		preempt_disable();
 		de = intel_engine_get_busy_time(engine, &t[0]);
-		udelay(100);
+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+		udelay(10000);
 		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
 		preempt_enable();
-		dt = ktime_sub(t[1], t[0]);
+
+		dt = intel_engine_uses_guc(engine) ?
+		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
+		     ktime_sub(t[1], t[0]);
+
 		if (de < 0 || de > 10) {
 			pr_err("%s: reported %lldns [%d%%] busyness while sleeping [for %lldns]\n",
 			       engine->name,
@@ -283,10 +290,16 @@ static int live_engine_busy_stats(void *arg)
 		ENGINE_TRACE(engine, "measuring busy time\n");
 		preempt_disable();
 		de = intel_engine_get_busy_time(engine, &t[0]);
-		udelay(100);
+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+		udelay(10000);
 		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
 		preempt_enable();
-		dt = ktime_sub(t[1], t[0]);
+
+		dt = intel_engine_uses_guc(engine) ?
+		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
+		     ktime_sub(t[1], t[0]);
+
 		if (100 * de < 95 * dt || 95 * de > 100 * dt) {
 			pr_err("%s: reported %lldns [%d%%] busyness while spinning [for %lldns]\n",
 			       engine->name,
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index 8ff582222aff..ff1311d4beff 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -143,6 +143,7 @@ enum intel_guc_action {
 	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
 	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
 	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
 	INTEL_GUC_ACTION_LIMIT
 };
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 5dd174babf7a..3c3d48c7d5de 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -104,6 +104,8 @@ struct intel_guc {
 	u32 ads_regset_size;
 	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
 	u32 ads_golden_ctxt_size;
+	/** @ads_engine_usage_size: size of engine usage in the ADS */
+	u32 ads_engine_usage_size;
 
 	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
 	struct i915_vma *lrc_desc_pool;
@@ -138,6 +140,34 @@ struct intel_guc {
 
 	/** @send_mutex: used to serialize the intel_guc_send actions */
 	struct mutex send_mutex;
+
+	/**
+	 * @timestamp: GT timestamp object that stores a copy of the timestamp
+	 * and adjusts it for overflow using a worker.
+	 */
+	struct {
+		/**
+		 * @lock: Lock protecting the below fields and the engine stats.
+		 */
+		spinlock_t lock;
+
+		/**
+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
+		 */
+		u64 gt_stamp;
+
+		/**
+		 * @ping_delay: Period for polling the GT timestamp for
+		 * overflow.
+		 */
+		unsigned long ping_delay;
+
+		/**
+		 * @work: Periodic work to adjust GT timestamp, engine and
+		 * context usage for overflows.
+		 */
+		struct delayed_work work;
+	} timestamp;
 };
 
 static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 2c6ea64af7ec..ca9ab53999d5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -26,6 +26,8 @@
  *      | guc_policies                          |
  *      +---------------------------------------+
  *      | guc_gt_system_info                    |
+ *      +---------------------------------------+
+ *      | guc_engine_usage                      |
  *      +---------------------------------------+ <== static
  *      | guc_mmio_reg[countA] (engine 0.0)     |
  *      | guc_mmio_reg[countB] (engine 0.1)     |
@@ -47,6 +49,7 @@ struct __guc_ads_blob {
 	struct guc_ads ads;
 	struct guc_policies policies;
 	struct guc_gt_system_info system_info;
+	struct guc_engine_usage engine_usage;
 	/* From here on, location is dynamic! Refer to above diagram. */
 	struct guc_mmio_reg regset[0];
 } __packed;
@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
 
 	guc_ads_private_data_reset(guc);
 }
+
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
+{
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
+	u32 offset = base + ptr_offset(blob, engine_usage);
+
+	return offset;
+}
+
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
+{
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u8 guc_class = engine_class_to_guc_class(engine->class);
+
+	return &blob->engine_usage.engines[guc_class][engine->instance];
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
index 3d85051d57e4..e74c110facff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
@@ -6,8 +6,11 @@
 #ifndef _INTEL_GUC_ADS_H_
 #define _INTEL_GUC_ADS_H_
 
+#include <linux/types.h>
+
 struct intel_guc;
 struct drm_printer;
+struct intel_engine_cs;
 
 int intel_guc_ads_create(struct intel_guc *guc);
 void intel_guc_ads_destroy(struct intel_guc *guc);
@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
 void intel_guc_ads_reset(struct intel_guc *guc);
 void intel_guc_ads_print_policy_info(struct intel_guc *guc,
 				     struct drm_printer *p);
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
 
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index fa4be13c8854..7c9c081670fc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -294,6 +294,19 @@ struct guc_ads {
 	u32 reserved[15];
 } __packed;
 
+/* Engine usage stats */
+struct guc_engine_usage_record {
+	u32 current_context_index;
+	u32 last_switch_in_stamp;
+	u32 reserved0;
+	u32 total_runtime;
+	u32 reserved1[4];
+} __packed;
+
+struct guc_engine_usage {
+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
+} __packed;
+
 /* GuC logging structures */
 
 enum guc_log_buffer_type {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index ba0de35f6323..f0d09feff14e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -12,6 +12,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_clock_utils.h"
 #include "gt/intel_gt_irq.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_requests.h"
@@ -20,6 +21,7 @@
 #include "gt/intel_mocs.h"
 #include "gt/intel_ring.h"
 
+#include "intel_guc_ads.h"
 #include "intel_guc_submission.h"
 
 #include "i915_drv.h"
@@ -750,6 +752,268 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 	xa_unlock_irqrestore(&guc->context_lookup, flags);
 }
 
+/*
+ * GuC stores busyness stats for each engine at context in/out boundaries. A
+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
+ * GuC.
+ *
+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
+ * active. For an active engine total busyness = total + (now - start), where
+ * 'now' is the time at which the busyness is sampled. For inactive engine,
+ * total busyness = total.
+ *
+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
+ *
+ * The start and total values provided by GuC are 32 bits and wrap around in a
+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
+ * increasing ns values, there is a need for this implementation to account for
+ * overflows and extend the GuC provided values to 64 bits before returning
+ * busyness to the user. In order to do that, a worker runs periodically at
+ * frequency = 1/8th the time it takes for the timestamp to wrap (i.e. once in
+ * 27 seconds for a gt clock frequency of 19.2 MHz).
+ */
+
+#define WRAP_TIME_CLKS U32_MAX
+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
+
+static void
+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
+{
+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
+
+	if (new_start == lower_32_bits(*prev_start))
+		return;
+
+	if (new_start < gt_stamp_last &&
+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
+		gt_stamp_hi++;
+
+	if (new_start > gt_stamp_last &&
+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
+		gt_stamp_hi--;
+
+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
+}
+
+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
+{
+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	u32 last_switch = rec->last_switch_in_stamp;
+	u32 ctx_id = rec->current_context_index;
+	u32 total = rec->total_runtime;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	stats->running = ctx_id != ~0U && last_switch;
+	if (stats->running)
+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
+
+	/*
+	 * Instead of adjusting the total for overflow, just add the
+	 * difference from previous sample stats->total_gt_clks
+	 */
+	if (total && total != ~0U) {
+		stats->total_gt_clks += (u32)(total - stats->prev_total);
+		stats->prev_total = total;
+	}
+}
+
+static void guc_update_pm_timestamp(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	u32 gt_stamp_now, gt_stamp_hi;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+
+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
+		gt_stamp_hi++;
+
+	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
+}
+
+/*
+ * Unlike the execlist mode of submission total and active times are in terms of
+ * gt clocks. The *now parameter is retained to return the cpu time at which the
+ * busyness was sampled.
+ */
+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
+{
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_gt *gt = engine->gt;
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+	bool reset_in_progress;
+	u64 total;
+	int srcu;
+
+	/*
+	 * If a reset is in progress, we risk reading partially updated
+	 * engine busyness from GuC, so we just use the driver stored
+	 * copy of busyness. Synchronize with gt reset lock to achieve
+	 * this.
+	 */
+	reset_in_progress = intel_gt_reset_trylock_no_wait(gt, &srcu);
+
+	/*
+	 * The order of taking the reset lock first and then the
+	 * timestamp lock is intentional to avoid lock inversion related
+	 * issues.
+	 */
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	*now = ktime_get();
+
+	/*
+	 * The active busyness depends on start_gt_clk and gt_stamp.
+	 * gt_stamp is updated by i915 only when gt is awake and the
+	 * start_gt_clk is derived from GuC state. To get a consistent
+	 * view of activity, we query the GuC state only if gt is awake.
+	 */
+	if (intel_gt_pm_get_if_awake(gt) && !reset_in_progress) {
+		guc_update_engine_gt_clks(engine);
+		guc_update_pm_timestamp(guc);
+		intel_gt_pm_put_async(gt);
+	}
+
+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
+	if (stats->running) {
+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
+
+		total += intel_gt_clock_interval_to_ns(gt, clk);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+	if (!reset_in_progress)
+		intel_gt_reset_unlock(gt, srcu);
+
+	return ns_to_ktime(total);
+}
+
+static void __reset_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long flags;
+
+	cancel_delayed_work_sync(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id) {
+		guc_update_engine_gt_clks(engine);
+		engine->stats.guc.prev_total = 0;
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void __update_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id)
+		guc_update_engine_gt_clks(engine);
+}
+
+static void guc_timestamp_ping(struct work_struct *wrk)
+{
+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
+					     timestamp.work.work);
+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+	unsigned long flags;
+	int srcu, ret;
+
+	/*
+	 * Synchronize with gt reset to make sure the worker does not
+	 * corrupt the engine/guc stats.
+	 */
+	ret = intel_gt_reset_trylock(gt, &srcu);
+	if (ret)
+		return;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
+		__update_guc_busyness_stats(guc);
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+
+	intel_gt_reset_unlock(gt, srcu);
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
+static int guc_action_enable_usage_stats(struct intel_guc *guc)
+{
+	u32 offset = intel_guc_engine_usage_offset(guc);
+	u32 action[] = {
+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
+		offset,
+		0,
+	};
+
+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
+}
+
+static void guc_init_engine_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
+		int ret = guc_action_enable_usage_stats(guc);
+
+		if (ret)
+			drm_err(&gt->i915->drm,
+				"Failed to enable usage stats: %d!\n", ret);
+	}
+}
+
+void intel_guc_busyness_park(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+
+	if (!guc_submission_initialized(guc))
+		return;
+
+	cancel_delayed_work(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+	__update_guc_busyness_stats(guc);
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+void intel_guc_busyness_unpark(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+
+	if (!guc_submission_initialized(guc))
+		return;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -809,6 +1073,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
 	intel_gt_park_heartbeats(guc_to_gt(guc));
 	disable_submission(guc);
 	guc->interrupts.disable(guc);
+	__reset_guc_busyness_stats(guc);
 
 	/* Flush IRQ handler */
 	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
@@ -1132,6 +1397,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
  */
 int intel_guc_submission_init(struct intel_guc *guc)
 {
+	struct intel_gt *gt = guc_to_gt(guc);
 	int ret;
 
 	if (guc->lrc_desc_pool)
@@ -1152,6 +1418,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	INIT_LIST_HEAD(&guc->guc_id_list);
 	ida_init(&guc->guc_ids);
 
+	spin_lock_init(&guc->timestamp.lock);
+	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
+	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
+
 	return 0;
 }
 
@@ -2606,7 +2876,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_flush = gen12_emit_flush_xcs;
 	}
 	engine->set_default_submission = guc_set_default_submission;
+	engine->busyness = guc_engine_busyness;
 
+	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
 	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
 	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
 
@@ -2705,6 +2977,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
 void intel_guc_submission_enable(struct intel_guc *guc)
 {
 	guc_init_lrc_mapping(guc);
+	guc_init_engine_stats(guc);
 }
 
 void intel_guc_submission_disable(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c7ef44fa0c36..5a95a9f0a8e3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
 void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
 				    struct i915_request *hung_rq,
 				    struct drm_printer *m);
+void intel_guc_busyness_park(struct intel_gt *gt);
+void intel_guc_busyness_unpark(struct intel_gt *gt);
 
 bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a897f4abea0c..9aee08425382 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
 #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
 
+#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
+
 /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
 #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
 #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [Intel-gfx] [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
@ 2021-10-15 23:47   ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-15 23:47 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: john.c.harrison, Tvrtko Ursulin, daniel.vetter, Matthew Brost

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

	engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically with
a period of 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e. kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
  called during i915 load. This ends up calling the guc busyness unpark
  hook and results in kickstarting an uninitialized worker. Let
  park/unpark hooks check if guc submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
  of that.

v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For guc mode of submission the engine busyness is derived from gt time
  domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
  longer and falls within the busyness tolerances in selftest.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c         |  15 +
 drivers/gpu/drm/i915/gt/intel_reset.h         |   1 +
 drivers/gpu/drm/i915/gt/selftest_engine_pm.c  |  21 +-
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 273 ++++++++++++++++++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h               |   2 +
 15 files changed, 449 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	intel_engine_print_breadcrumbs(engine, m);
 }
 
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-					    ktime_t *now)
-{
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	ktime_t total = stats->total;
-
-	/*
-	 * If the engine is executing something at the moment
-	 * add it to the total.
-	 */
-	*now = ktime_get();
-	if (READ_ONCE(stats->active))
-		total = ktime_add(total, ktime_sub(*now, stats->start));
-
-	return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine busyness
  * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
 {
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	unsigned int seq;
-	ktime_t total;
-
-	do {
-		seq = read_seqcount_begin(&stats->lock);
-		total = __intel_engine_get_busy_time(engine, now);
-	} while (read_seqcount_retry(&stats->lock, seq));
-
-	return total;
+	return engine->busyness(engine, now);
 }
 
 struct intel_context *
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index b820a2c1124e..9300c65d6675 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
 	ktime_t start;
 };
 
+struct intel_engine_guc_stats {
+	/**
+	 * @running: Active state of the engine when busyness was last sampled.
+	 */
+	bool running;
+
+	/**
+	 * @prev_total: Previous value of total runtime clock cycles.
+	 */
+	u32 prev_total;
+
+	/**
+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
+	 */
+	u64 total_gt_clks;
+
+	/**
+	 * @start_gt_clk: GT clock time of last idle to active transition.
+	 */
+	u64 start_gt_clk;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -459,6 +481,12 @@ struct intel_engine_cs {
 	void		(*add_active_request)(struct i915_request *rq);
 	void		(*remove_active_request)(struct i915_request *rq);
 
+	/*
+	 * Get engine busyness and the time at which the busyness was sampled.
+	 */
+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
+				    ktime_t *now);
+
 	struct intel_engine_execlists execlists;
 
 	/*
@@ -508,7 +536,10 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 
 	struct {
-		struct intel_engine_execlists_stats execlists;
+		union {
+			struct intel_engine_execlists_stats execlists;
+			struct intel_engine_guc_stats guc;
+		};
 
 		/**
 		 * @rps: Utilisation at last RPS sampling.
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 73a79c2acd3a..e8ffcf36f6f4 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
 	lrc_fini_wa_ctx(engine);
 }
 
+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
+					   ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	ktime_t total = stats->total;
+
+	/*
+	 * If the engine is executing something at the moment
+	 * add it to the total.
+	 */
+	*now = ktime_get();
+	if (READ_ONCE(stats->active))
+		total = ktime_add(total, ktime_sub(*now, stats->start));
+
+	return total;
+}
+
+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
+					 ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	unsigned int seq;
+	ktime_t total;
+
+	do {
+		seq = read_seqcount_begin(&stats->lock);
+		total = __execlists_engine_busyness(engine, now);
+	} while (read_seqcount_retry(&stats->lock, seq));
+
+	return total;
+}
+
 static void
 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
@@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_bb_start = gen8_emit_bb_start;
 	else
 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
+
+	engine->busyness = execlists_engine_busyness;
 }
 
 static void logical_ring_default_irqs(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 524eaf678790..b4a8594bc46c 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 	intel_rc6_unpark(&gt->rc6);
 	intel_rps_unpark(&gt->rps);
 	i915_pmu_gt_unparked(i915);
+	intel_guc_busyness_unpark(gt);
 
 	intel_gt_unpark_requests(gt);
 	runtime_begin(gt);
@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
 	runtime_end(gt);
 	intel_gt_park_requests(gt);
 
+	intel_guc_busyness_park(gt);
 	i915_vma_parked(gt);
 	i915_pmu_gt_parked(i915);
 	intel_rps_park(&gt->rps);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 91200c43951f..37b4e6b852a6 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1389,6 +1389,21 @@ void intel_gt_handle_error(struct intel_gt *gt,
 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
 }
 
+bool intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu)
+{
+	int reset_in_progress;
+
+	might_lock(&gt->reset.backoff_srcu);
+
+	rcu_read_lock();
+	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
+	if (!reset_in_progress)
+		*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
+	rcu_read_unlock();
+
+	return reset_in_progress;
+}
+
 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
 {
 	might_lock(&gt->reset.backoff_srcu);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
index adc734e67387..4f5f4c00c54f 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.h
+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
@@ -38,6 +38,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine,
 
 void __i915_request_reset(struct i915_request *rq, bool guilty);
 
+bool __must_check intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu);
 int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
 void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
 
diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
index 75569666105d..24358bef6691 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
 		struct i915_request *rq;
 		ktime_t de, dt;
 		ktime_t t[2];
+		u32 gt_stamp;
 
 		if (!intel_engine_supports_stats(engine))
 			continue;
@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
 		ENGINE_TRACE(engine, "measuring idle time\n");
 		preempt_disable();
 		de = intel_engine_get_busy_time(engine, &t[0]);
-		udelay(100);
+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+		udelay(10000);
 		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
 		preempt_enable();
-		dt = ktime_sub(t[1], t[0]);
+
+		dt = intel_engine_uses_guc(engine) ?
+		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
+		     ktime_sub(t[1], t[0]);
+
 		if (de < 0 || de > 10) {
 			pr_err("%s: reported %lldns [%d%%] busyness while sleeping [for %lldns]\n",
 			       engine->name,
@@ -283,10 +290,16 @@ static int live_engine_busy_stats(void *arg)
 		ENGINE_TRACE(engine, "measuring busy time\n");
 		preempt_disable();
 		de = intel_engine_get_busy_time(engine, &t[0]);
-		udelay(100);
+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+		udelay(10000);
 		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
 		preempt_enable();
-		dt = ktime_sub(t[1], t[0]);
+
+		dt = intel_engine_uses_guc(engine) ?
+		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
+		     ktime_sub(t[1], t[0]);
+
 		if (100 * de < 95 * dt || 95 * de > 100 * dt) {
 			pr_err("%s: reported %lldns [%d%%] busyness while spinning [for %lldns]\n",
 			       engine->name,
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index 8ff582222aff..ff1311d4beff 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -143,6 +143,7 @@ enum intel_guc_action {
 	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
 	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
 	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
 	INTEL_GUC_ACTION_LIMIT
 };
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 5dd174babf7a..3c3d48c7d5de 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -104,6 +104,8 @@ struct intel_guc {
 	u32 ads_regset_size;
 	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
 	u32 ads_golden_ctxt_size;
+	/** @ads_engine_usage_size: size of engine usage in the ADS */
+	u32 ads_engine_usage_size;
 
 	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
 	struct i915_vma *lrc_desc_pool;
@@ -138,6 +140,34 @@ struct intel_guc {
 
 	/** @send_mutex: used to serialize the intel_guc_send actions */
 	struct mutex send_mutex;
+
+	/**
+	 * @timestamp: GT timestamp object that stores a copy of the timestamp
+	 * and adjusts it for overflow using a worker.
+	 */
+	struct {
+		/**
+		 * @lock: Lock protecting the below fields and the engine stats.
+		 */
+		spinlock_t lock;
+
+		/**
+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
+		 */
+		u64 gt_stamp;
+
+		/**
+		 * @ping_delay: Period for polling the GT timestamp for
+		 * overflow.
+		 */
+		unsigned long ping_delay;
+
+		/**
+		 * @work: Periodic work to adjust GT timestamp, engine and
+		 * context usage for overflows.
+		 */
+		struct delayed_work work;
+	} timestamp;
 };
 
 static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 2c6ea64af7ec..ca9ab53999d5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -26,6 +26,8 @@
  *      | guc_policies                          |
  *      +---------------------------------------+
  *      | guc_gt_system_info                    |
+ *      +---------------------------------------+
+ *      | guc_engine_usage                      |
  *      +---------------------------------------+ <== static
  *      | guc_mmio_reg[countA] (engine 0.0)     |
  *      | guc_mmio_reg[countB] (engine 0.1)     |
@@ -47,6 +49,7 @@ struct __guc_ads_blob {
 	struct guc_ads ads;
 	struct guc_policies policies;
 	struct guc_gt_system_info system_info;
+	struct guc_engine_usage engine_usage;
 	/* From here on, location is dynamic! Refer to above diagram. */
 	struct guc_mmio_reg regset[0];
 } __packed;
@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
 
 	guc_ads_private_data_reset(guc);
 }
+
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
+{
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
+	u32 offset = base + ptr_offset(blob, engine_usage);
+
+	return offset;
+}
+
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
+{
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u8 guc_class = engine_class_to_guc_class(engine->class);
+
+	return &blob->engine_usage.engines[guc_class][engine->instance];
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
index 3d85051d57e4..e74c110facff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
@@ -6,8 +6,11 @@
 #ifndef _INTEL_GUC_ADS_H_
 #define _INTEL_GUC_ADS_H_
 
+#include <linux/types.h>
+
 struct intel_guc;
 struct drm_printer;
+struct intel_engine_cs;
 
 int intel_guc_ads_create(struct intel_guc *guc);
 void intel_guc_ads_destroy(struct intel_guc *guc);
@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
 void intel_guc_ads_reset(struct intel_guc *guc);
 void intel_guc_ads_print_policy_info(struct intel_guc *guc,
 				     struct drm_printer *p);
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
 
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index fa4be13c8854..7c9c081670fc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -294,6 +294,19 @@ struct guc_ads {
 	u32 reserved[15];
 } __packed;
 
+/* Engine usage stats */
+struct guc_engine_usage_record {
+	u32 current_context_index;
+	u32 last_switch_in_stamp;
+	u32 reserved0;
+	u32 total_runtime;
+	u32 reserved1[4];
+} __packed;
+
+struct guc_engine_usage {
+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
+} __packed;
+
 /* GuC logging structures */
 
 enum guc_log_buffer_type {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index ba0de35f6323..f0d09feff14e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -12,6 +12,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_clock_utils.h"
 #include "gt/intel_gt_irq.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_requests.h"
@@ -20,6 +21,7 @@
 #include "gt/intel_mocs.h"
 #include "gt/intel_ring.h"
 
+#include "intel_guc_ads.h"
 #include "intel_guc_submission.h"
 
 #include "i915_drv.h"
@@ -750,6 +752,268 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 	xa_unlock_irqrestore(&guc->context_lookup, flags);
 }
 
+/*
+ * GuC stores busyness stats for each engine at context in/out boundaries. A
+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
+ * GuC.
+ *
+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
+ * active. For an active engine total busyness = total + (now - start), where
+ * 'now' is the time at which the busyness is sampled. For inactive engine,
+ * total busyness = total.
+ *
+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
+ *
+ * The start and total values provided by GuC are 32 bits and wrap around in a
+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
+ * increasing ns values, there is a need for this implementation to account for
+ * overflows and extend the GuC provided values to 64 bits before returning
+ * busyness to the user. In order to do that, a worker runs periodically at
+ * frequency = 1/8th the time it takes for the timestamp to wrap (i.e. once in
+ * 27 seconds for a gt clock frequency of 19.2 MHz).
+ */
+
+#define WRAP_TIME_CLKS U32_MAX
+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
+
+static void
+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
+{
+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
+
+	if (new_start == lower_32_bits(*prev_start))
+		return;
+
+	if (new_start < gt_stamp_last &&
+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
+		gt_stamp_hi++;
+
+	if (new_start > gt_stamp_last &&
+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
+		gt_stamp_hi--;
+
+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
+}
+
+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
+{
+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	u32 last_switch = rec->last_switch_in_stamp;
+	u32 ctx_id = rec->current_context_index;
+	u32 total = rec->total_runtime;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	stats->running = ctx_id != ~0U && last_switch;
+	if (stats->running)
+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
+
+	/*
+	 * Instead of adjusting the total for overflow, just add the
+	 * difference from previous sample stats->total_gt_clks
+	 */
+	if (total && total != ~0U) {
+		stats->total_gt_clks += (u32)(total - stats->prev_total);
+		stats->prev_total = total;
+	}
+}
+
+static void guc_update_pm_timestamp(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	u32 gt_stamp_now, gt_stamp_hi;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+
+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
+		gt_stamp_hi++;
+
+	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
+}
+
+/*
+ * Unlike the execlist mode of submission total and active times are in terms of
+ * gt clocks. The *now parameter is retained to return the cpu time at which the
+ * busyness was sampled.
+ */
+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
+{
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_gt *gt = engine->gt;
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+	bool reset_in_progress;
+	u64 total;
+	int srcu;
+
+	/*
+	 * If a reset is in progress, we risk reading partially updated
+	 * engine busyness from GuC, so we just use the driver stored
+	 * copy of busyness. Synchronize with gt reset lock to achieve
+	 * this.
+	 */
+	reset_in_progress = intel_gt_reset_trylock_no_wait(gt, &srcu);
+
+	/*
+	 * The order of taking the reset lock first and then the
+	 * timestamp lock is intentional to avoid lock inversion related
+	 * issues.
+	 */
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	*now = ktime_get();
+
+	/*
+	 * The active busyness depends on start_gt_clk and gt_stamp.
+	 * gt_stamp is updated by i915 only when gt is awake and the
+	 * start_gt_clk is derived from GuC state. To get a consistent
+	 * view of activity, we query the GuC state only if gt is awake.
+	 */
+	if (intel_gt_pm_get_if_awake(gt) && !reset_in_progress) {
+		guc_update_engine_gt_clks(engine);
+		guc_update_pm_timestamp(guc);
+		intel_gt_pm_put_async(gt);
+	}
+
+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
+	if (stats->running) {
+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
+
+		total += intel_gt_clock_interval_to_ns(gt, clk);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+	if (!reset_in_progress)
+		intel_gt_reset_unlock(gt, srcu);
+
+	return ns_to_ktime(total);
+}
+
+static void __reset_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long flags;
+
+	cancel_delayed_work_sync(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id) {
+		guc_update_engine_gt_clks(engine);
+		engine->stats.guc.prev_total = 0;
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void __update_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id)
+		guc_update_engine_gt_clks(engine);
+}
+
+static void guc_timestamp_ping(struct work_struct *wrk)
+{
+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
+					     timestamp.work.work);
+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+	unsigned long flags;
+	int srcu, ret;
+
+	/*
+	 * Synchronize with gt reset to make sure the worker does not
+	 * corrupt the engine/guc stats.
+	 */
+	ret = intel_gt_reset_trylock(gt, &srcu);
+	if (ret)
+		return;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
+		__update_guc_busyness_stats(guc);
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+
+	intel_gt_reset_unlock(gt, srcu);
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
+static int guc_action_enable_usage_stats(struct intel_guc *guc)
+{
+	u32 offset = intel_guc_engine_usage_offset(guc);
+	u32 action[] = {
+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
+		offset,
+		0,
+	};
+
+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
+}
+
+static void guc_init_engine_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
+		int ret = guc_action_enable_usage_stats(guc);
+
+		if (ret)
+			drm_err(&gt->i915->drm,
+				"Failed to enable usage stats: %d!\n", ret);
+	}
+}
+
+void intel_guc_busyness_park(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+
+	if (!guc_submission_initialized(guc))
+		return;
+
+	cancel_delayed_work(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+	__update_guc_busyness_stats(guc);
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+void intel_guc_busyness_unpark(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+
+	if (!guc_submission_initialized(guc))
+		return;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -809,6 +1073,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
 	intel_gt_park_heartbeats(guc_to_gt(guc));
 	disable_submission(guc);
 	guc->interrupts.disable(guc);
+	__reset_guc_busyness_stats(guc);
 
 	/* Flush IRQ handler */
 	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
@@ -1132,6 +1397,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
  */
 int intel_guc_submission_init(struct intel_guc *guc)
 {
+	struct intel_gt *gt = guc_to_gt(guc);
 	int ret;
 
 	if (guc->lrc_desc_pool)
@@ -1152,6 +1418,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	INIT_LIST_HEAD(&guc->guc_id_list);
 	ida_init(&guc->guc_ids);
 
+	spin_lock_init(&guc->timestamp.lock);
+	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
+	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
+
 	return 0;
 }
 
@@ -2606,7 +2876,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_flush = gen12_emit_flush_xcs;
 	}
 	engine->set_default_submission = guc_set_default_submission;
+	engine->busyness = guc_engine_busyness;
 
+	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
 	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
 	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
 
@@ -2705,6 +2977,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
 void intel_guc_submission_enable(struct intel_guc *guc)
 {
 	guc_init_lrc_mapping(guc);
+	guc_init_engine_stats(guc);
 }
 
 void intel_guc_submission_disable(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c7ef44fa0c36..5a95a9f0a8e3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
 void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
 				    struct i915_request *hung_rq,
 				    struct drm_printer *m);
+void intel_guc_busyness_park(struct intel_gt *gt);
+void intel_guc_busyness_unpark(struct intel_gt *gt);
 
 bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a897f4abea0c..9aee08425382 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
 #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
 
+#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
+
 /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
 #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
 #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/i915/pmu: Add a name to the execlists stats
  2021-10-15 23:47 ` [Intel-gfx] " Umesh Nerlige Ramappa
  (?)
  (?)
@ 2021-10-16  1:22 ` Patchwork
  -1 siblings, 0 replies; 31+ messages in thread
From: Patchwork @ 2021-10-16  1:22 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915/pmu: Add a name to the execlists stats
URL   : https://patchwork.freedesktop.org/series/95904/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
7d0d1caf5425 drm/i915/pmu: Add a name to the execlists stats
08a443a8135c drm/i915/pmu: Connect engine busyness stats from GuC to pmu
-:325: CHECK:USLEEP_RANGE: usleep_range is preferred over udelay; see Documentation/timers/timers-howto.rst
#325: FILE: drivers/gpu/drm/i915/gt/selftest_engine_pm.c:256:
+		udelay(10000);

-:325: WARNING:LONG_UDELAY: long udelay - prefer mdelay; see arch/arm/include/asm/delay.h
#325: FILE: drivers/gpu/drm/i915/gt/selftest_engine_pm.c:256:
+		udelay(10000);

-:344: CHECK:USLEEP_RANGE: usleep_range is preferred over udelay; see Documentation/timers/timers-howto.rst
#344: FILE: drivers/gpu/drm/i915/gt/selftest_engine_pm.c:294:
+		udelay(10000);

-:344: WARNING:LONG_UDELAY: long udelay - prefer mdelay; see arch/arm/include/asm/delay.h
#344: FILE: drivers/gpu/drm/i915/gt/selftest_engine_pm.c:294:
+		udelay(10000);

total: 0 errors, 2 warnings, 2 checks, 682 lines checked



^ permalink raw reply	[flat|nested] 31+ messages in thread

* [Intel-gfx] ✗ Fi.CI.SPARSE: warning for series starting with [1/2] drm/i915/pmu: Add a name to the execlists stats
  2021-10-15 23:47 ` [Intel-gfx] " Umesh Nerlige Ramappa
                   ` (2 preceding siblings ...)
  (?)
@ 2021-10-16  1:24 ` Patchwork
  -1 siblings, 0 replies; 31+ messages in thread
From: Patchwork @ 2021-10-16  1:24 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915/pmu: Add a name to the execlists stats
URL   : https://patchwork.freedesktop.org/series/95904/
State : warning

== Summary ==

$ dim sparse --fast origin/drm-tip
Sparse version: v0.6.2
Fast mode used, each commit won't be checked separately.
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:28:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:28:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:28:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:33:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:33:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:51:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:51:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:51:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:57:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_engine_stats.h:57:9: warning: trying to copy expression type 31
+drivers/gpu/drm/i915/gt/intel_reset.c:1407:5: warning: context imbalance in 'intel_gt_reset_trylock' - different lock contexts for basic block
+./include/linux/rcupdate.h:716:9: warning: context imbalance in 'intel_gt_reset_trylock_no_wait' - different lock contexts for basic block
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:27:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:27:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:27:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:32:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:32:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:49:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:49:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:49:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:56:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_engine_stats.h:56:9: warning: trying to copy expression type 31
-O:drivers/gpu/drm/i915/gt/intel_reset.c:1392:5: warning: context imbalance in 'intel_gt_reset_trylock' - different lock contexts for basic block



^ permalink raw reply	[flat|nested] 31+ messages in thread

* [Intel-gfx] ✗ Fi.CI.BAT: failure for series starting with [1/2] drm/i915/pmu: Add a name to the execlists stats
  2021-10-15 23:47 ` [Intel-gfx] " Umesh Nerlige Ramappa
                   ` (3 preceding siblings ...)
  (?)
@ 2021-10-16  2:06 ` Patchwork
  -1 siblings, 0 replies; 31+ messages in thread
From: Patchwork @ 2021-10-16  2:06 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 7001 bytes --]

== Series Details ==

Series: series starting with [1/2] drm/i915/pmu: Add a name to the execlists stats
URL   : https://patchwork.freedesktop.org/series/95904/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_10744 -> Patchwork_21358
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_21358 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_21358, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_21358:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live@gt_engines:
    - fi-rkl-guc:         [PASS][1] -> [INCOMPLETE][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10744/fi-rkl-guc/igt@i915_selftest@live@gt_engines.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-rkl-guc/igt@i915_selftest@live@gt_engines.html

  
Known issues
------------

  Here are the changes found in Patchwork_21358 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_exec_fence@basic-busy@bcs0:
    - fi-kbl-soraka:      NOTRUN -> [SKIP][3] ([fdo#109271]) +9 similar issues
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-kbl-soraka/igt@gem_exec_fence@basic-busy@bcs0.html

  * igt@gem_huc_copy@huc-copy:
    - fi-kbl-soraka:      NOTRUN -> [SKIP][4] ([fdo#109271] / [i915#2190])
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-kbl-soraka/igt@gem_huc_copy@huc-copy.html

  * igt@i915_selftest@live@gt_pm:
    - fi-kbl-soraka:      NOTRUN -> [DMESG-FAIL][5] ([i915#1886] / [i915#2291])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-kbl-soraka/igt@i915_selftest@live@gt_pm.html

  * igt@kms_chamelium@common-hpd-after-suspend:
    - fi-kbl-soraka:      NOTRUN -> [SKIP][6] ([fdo#109271] / [fdo#111827]) +8 similar issues
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-kbl-soraka/igt@kms_chamelium@common-hpd-after-suspend.html
    - fi-kbl-7500u:       [PASS][7] -> [DMESG-WARN][8] ([i915#2868])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10744/fi-kbl-7500u/igt@kms_chamelium@common-hpd-after-suspend.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-kbl-7500u/igt@kms_chamelium@common-hpd-after-suspend.html

  * igt@kms_flip@basic-flip-vs-dpms@c-dp2:
    - fi-cfl-8109u:       [PASS][9] -> [DMESG-WARN][10] ([i915#165])
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10744/fi-cfl-8109u/igt@kms_flip@basic-flip-vs-dpms@c-dp2.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-cfl-8109u/igt@kms_flip@basic-flip-vs-dpms@c-dp2.html

  * igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b:
    - fi-cfl-8109u:       [PASS][11] -> [DMESG-WARN][12] ([i915#165] / [i915#295]) +26 similar issues
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10744/fi-cfl-8109u/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-cfl-8109u/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-b.html

  * igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-d:
    - fi-kbl-soraka:      NOTRUN -> [SKIP][13] ([fdo#109271] / [i915#533])
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-kbl-soraka/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-d.html

  * igt@prime_vgem@basic-userptr:
    - fi-pnv-d510:        NOTRUN -> [SKIP][14] ([fdo#109271]) +48 similar issues
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-pnv-d510/igt@prime_vgem@basic-userptr.html

  * igt@runner@aborted:
    - fi-bdw-5557u:       NOTRUN -> [FAIL][15] ([i915#1602] / [i915#2029])
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-bdw-5557u/igt@runner@aborted.html
    - fi-rkl-guc:         NOTRUN -> [FAIL][16] ([i915#3928])
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-rkl-guc/igt@runner@aborted.html

  
#### Possible fixes ####

  * igt@gem_exec_parallel@engines@userptr:
    - fi-pnv-d510:        [INCOMPLETE][17] ([i915#299]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10744/fi-pnv-d510/igt@gem_exec_parallel@engines@userptr.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/fi-pnv-d510/igt@gem_exec_parallel@engines@userptr.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109285]: https://bugs.freedesktop.org/show_bug.cgi?id=109285
  [fdo#109315]: https://bugs.freedesktop.org/show_bug.cgi?id=109315
  [fdo#111827]: https://bugs.freedesktop.org/show_bug.cgi?id=111827
  [fdo#112080]: https://bugs.freedesktop.org/show_bug.cgi?id=112080
  [i915#1602]: https://gitlab.freedesktop.org/drm/intel/issues/1602
  [i915#165]: https://gitlab.freedesktop.org/drm/intel/issues/165
  [i915#1886]: https://gitlab.freedesktop.org/drm/intel/issues/1886
  [i915#2029]: https://gitlab.freedesktop.org/drm/intel/issues/2029
  [i915#2190]: https://gitlab.freedesktop.org/drm/intel/issues/2190
  [i915#2291]: https://gitlab.freedesktop.org/drm/intel/issues/2291
  [i915#2868]: https://gitlab.freedesktop.org/drm/intel/issues/2868
  [i915#295]: https://gitlab.freedesktop.org/drm/intel/issues/295
  [i915#299]: https://gitlab.freedesktop.org/drm/intel/issues/299
  [i915#3301]: https://gitlab.freedesktop.org/drm/intel/issues/3301
  [i915#3303]: https://gitlab.freedesktop.org/drm/intel/issues/3303
  [i915#3928]: https://gitlab.freedesktop.org/drm/intel/issues/3928
  [i915#4103]: https://gitlab.freedesktop.org/drm/intel/issues/4103
  [i915#533]: https://gitlab.freedesktop.org/drm/intel/issues/533


Participating hosts (39 -> 36)
------------------------------

  Additional (2): fi-kbl-soraka fi-jsl-1 
  Missing    (5): bat-dg1-6 fi-hsw-4200u bat-dg1-5 fi-bsw-cyan fi-bdw-samus 


Build changes
-------------

  * Linux: CI_DRM_10744 -> Patchwork_21358

  CI-20190529: 20190529
  CI_DRM_10744: dc405215cfabb6f13490cbdceb1f6e831e8a8596 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_6250: 3c2ac88757f0d0ac9450487d314fcaceebc8bc26 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_21358: 08a443a8135cf9ab19c3373833fe9a09594a06cb @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

08a443a8135c drm/i915/pmu: Connect engine busyness stats from GuC to pmu
7d0d1caf5425 drm/i915/pmu: Add a name to the execlists stats

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_21358/index.html

[-- Attachment #2: Type: text/html, Size: 7854 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-15 23:47   ` [Intel-gfx] " Umesh Nerlige Ramappa
@ 2021-10-18  7:58     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 31+ messages in thread
From: Tvrtko Ursulin @ 2021-10-18  7:58 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, dri-devel
  Cc: john.c.harrison, daniel.vetter, Matthew Brost



On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
> With GuC handling scheduling, i915 is not aware of the time that a
> context is scheduled in and out of the engine. Since i915 pmu relies on
> this info to provide engine busyness to the user, GuC shares this info
> with i915 for all engines using shared memory. For each engine, this
> info contains:
> 
> - total busyness: total time that the context was running (total)
> - id: id of the running context (id)
> - start timestamp: timestamp when the context started running (start)
> 
> At the time (now) of sampling the engine busyness, if the id is valid
> (!= ~0), and start is non-zero, then the context is considered to be
> active and the engine busyness is calculated using the below equation
> 
> 	engine busyness = total + (now - start)
> 
> All times are obtained from the gt clock base. For inactive contexts,
> engine busyness is just equal to the total.
> 
> The start and total values provided by GuC are 32 bits and wrap around
> in a few minutes. Since perf pmu provides busyness as 64 bit
> monotonically increasing values, there is a need for this implementation
> to account for overflows and extend the time to 64 bits before returning
> busyness to the user. In order to do that, a worker runs periodically at
> frequency = 1/8th the time it takes for the timestamp to wrap. As an
> example, that would be once in 27 seconds for a gt clock frequency of
> 19.2 MHz.
> 
> Note:
> There might be an overaccounting of busyness due to the fact that GuC
> may be updating the total and start values while kmd is reading them.
> (i.e. kmd may read the updated total and the stale start). In such a
> case, user may see higher busyness value followed by smaller ones which
> would eventually catch up to the higher value.
> 
> v2: (Tvrtko)
> - Include details in commit message
> - Move intel engine busyness function into execlist code
> - Use union inside engine->stats
> - Use natural type for ping delay jiffies
> - Drop active_work condition checks
> - Use for_each_engine if iterating all engines
> - Drop seq locking, use spinlock at guc level to update engine stats
> - Document worker specific details
> 
> v3: (Tvrtko/Umesh)
> - Demarcate guc and execlist stat objects with comments
> - Document known over-accounting issue in commit
> - Provide a consistent view of guc state
> - Add hooks to gt park/unpark for guc busyness
> - Stop/start worker in gt park/unpark path
> - Drop inline
> - Move spinlock and worker inits to guc initialization
> - Drop helpers that are called only once
> 
> v4: (Tvrtko/Matt/Umesh)
> - Drop addressed opens from commit message
> - Get runtime pm in ping, remove from the park path
> - Use cancel_delayed_work_sync in disable_submission path
> - Update stats during reset prepare
> - Skip ping if reset in progress
> - Explicitly name execlists and guc stats objects
> - Since disable_submission is called from many places, move resetting
>    stats to intel_guc_submission_reset_prepare
> 
> v5: (Tvrtko)
> - Add a trylock helper that does not sleep and synchronize PMU event
>    callbacks and worker with gt reset
> 
> v6: (CI BAT failures)
> - DUTs using execlist submission failed to boot since __gt_unpark is
>    called during i915 load. This ends up calling the guc busyness unpark
>    hook and results in kickstarting an uninitialized worker. Let
>    park/unpark hooks check if guc submission has been initialized.
> - drop cant_sleep() from trylock helper since rcu_read_lock takes care
>    of that.
> 
> v7: (CI) Fix igt@i915_selftest@live@gt_engines
> - For guc mode of submission the engine busyness is derived from gt time
>    domain. Use gt time elapsed as reference in the selftest.
> - Increase busyness calculation to 10ms duration to ensure batch runs
>    longer and falls within the busyness tolerances in selftest.

[snip]

>   
> diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> index 75569666105d..24358bef6691 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> @@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>   		struct i915_request *rq;
>   		ktime_t de, dt;
>   		ktime_t t[2];
> +		u32 gt_stamp;
>   
>   		if (!intel_engine_supports_stats(engine))
>   			continue;
> @@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>   		ENGINE_TRACE(engine, "measuring idle time\n");
>   		preempt_disable();
>   		de = intel_engine_get_busy_time(engine, &t[0]);
> -		udelay(100);
> +		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
> +		udelay(10000);
>   		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
> +		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
>   		preempt_enable();
> -		dt = ktime_sub(t[1], t[0]);
> +
> +		dt = intel_engine_uses_guc(engine) ?
> +		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
> +		     ktime_sub(t[1], t[0]);

But this then shows the thing might not work for external callers like 
PMU who have no idea about GUCPMTIMESTAMP and cannot obtain it anyway.

What is the root cause of the failure here, 100us or clock source? Is 
the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 100us 
test period? I forget what frequency it runs at.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
@ 2021-10-18  7:58     ` Tvrtko Ursulin
  0 siblings, 0 replies; 31+ messages in thread
From: Tvrtko Ursulin @ 2021-10-18  7:58 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, dri-devel
  Cc: john.c.harrison, daniel.vetter, Matthew Brost



On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
> With GuC handling scheduling, i915 is not aware of the time that a
> context is scheduled in and out of the engine. Since i915 pmu relies on
> this info to provide engine busyness to the user, GuC shares this info
> with i915 for all engines using shared memory. For each engine, this
> info contains:
> 
> - total busyness: total time that the context was running (total)
> - id: id of the running context (id)
> - start timestamp: timestamp when the context started running (start)
> 
> At the time (now) of sampling the engine busyness, if the id is valid
> (!= ~0), and start is non-zero, then the context is considered to be
> active and the engine busyness is calculated using the below equation
> 
> 	engine busyness = total + (now - start)
> 
> All times are obtained from the gt clock base. For inactive contexts,
> engine busyness is just equal to the total.
> 
> The start and total values provided by GuC are 32 bits and wrap around
> in a few minutes. Since perf pmu provides busyness as 64 bit
> monotonically increasing values, there is a need for this implementation
> to account for overflows and extend the time to 64 bits before returning
> busyness to the user. In order to do that, a worker runs periodically at
> an interval of 1/8th the time it takes for the timestamp to wrap. As an
> example, that would be once in 27 seconds for a gt clock frequency of
> 19.2 MHz.
> 
> Note:
> There might be an overaccounting of busyness due to the fact that GuC
> may be updating the total and start values while kmd is reading them.
> (i.e kmd may read the updated total and the stale start). In such a
> case, user may see higher busyness value followed by smaller ones which
> would eventually catch up to the higher value.
> 
> v2: (Tvrtko)
> - Include details in commit message
> - Move intel engine busyness function into execlist code
> - Use union inside engine->stats
> - Use natural type for ping delay jiffies
> - Drop active_work condition checks
> - Use for_each_engine if iterating all engines
> - Drop seq locking, use spinlock at guc level to update engine stats
> - Document worker specific details
> 
> v3: (Tvrtko/Umesh)
> - Demarcate guc and execlist stat objects with comments
> - Document known over-accounting issue in commit
> - Provide a consistent view of guc state
> - Add hooks to gt park/unpark for guc busyness
> - Stop/start worker in gt park/unpark path
> - Drop inline
> - Move spinlock and worker inits to guc initialization
> - Drop helpers that are called only once
> 
> v4: (Tvrtko/Matt/Umesh)
> - Drop addressed opens from commit message
> - Get runtime pm in ping, remove from the park path
> - Use cancel_delayed_work_sync in disable_submission path
> - Update stats during reset prepare
> - Skip ping if reset in progress
> - Explicitly name execlists and guc stats objects
> - Since disable_submission is called from many places, move resetting
>    stats to intel_guc_submission_reset_prepare
> 
> v5: (Tvrtko)
> - Add a trylock helper that does not sleep and synchronize PMU event
>    callbacks and worker with gt reset
> 
> v6: (CI BAT failures)
> - DUTs using execlist submission failed to boot since __gt_unpark is
>    called during i915 load. This ends up calling the guc busyness unpark
>    hook and results in kickstarting an uninitialized worker. Let
>    park/unpark hooks check if guc submission has been initialized.
> - drop cant_sleep() from trylock helper since rcu_read_lock takes care
>    of that.
> 
> v7: (CI) Fix igt@i915_selftest@live@gt_engines
> - For guc mode of submission the engine busyness is derived from gt time
>    domain. Use gt time elapsed as reference in the selftest.
> - Increase busyness calculation to 10ms duration to ensure batch runs
>    longer and falls within the busyness tolerances in selftest.

[snip]

>   
> diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> index 75569666105d..24358bef6691 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> @@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>   		struct i915_request *rq;
>   		ktime_t de, dt;
>   		ktime_t t[2];
> +		u32 gt_stamp;
>   
>   		if (!intel_engine_supports_stats(engine))
>   			continue;
> @@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>   		ENGINE_TRACE(engine, "measuring idle time\n");
>   		preempt_disable();
>   		de = intel_engine_get_busy_time(engine, &t[0]);
> -		udelay(100);
> +		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
> +		udelay(10000);
>   		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
> +		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
>   		preempt_enable();
> -		dt = ktime_sub(t[1], t[0]);
> +
> +		dt = intel_engine_uses_guc(engine) ?
> +		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
> +		     ktime_sub(t[1], t[0]);

But this then shows the thing might not work for external callers like 
PMU who have no idea about GUCPMTIMESTAMP and cannot obtain it anyway.

What is the root cause of the failure here, 100us or clock source? Is 
the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 100us 
test period? I forget what frequency it runs at.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-18  7:58     ` [Intel-gfx] " Tvrtko Ursulin
@ 2021-10-18 18:35       ` Umesh Nerlige Ramappa
  -1 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-18 18:35 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:
>
>
>On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
>>With GuC handling scheduling, i915 is not aware of the time that a
>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>this info to provide engine busyness to the user, GuC shares this info
>>with i915 for all engines using shared memory. For each engine, this
>>info contains:
>>
>>- total busyness: total time that the context was running (total)
>>- id: id of the running context (id)
>>- start timestamp: timestamp when the context started running (start)
>>
>>At the time (now) of sampling the engine busyness, if the id is valid
>>(!= ~0), and start is non-zero, then the context is considered to be
>>active and the engine busyness is calculated using the below equation
>>
>>	engine busyness = total + (now - start)
>>
>>All times are obtained from the gt clock base. For inactive contexts,
>>engine busyness is just equal to the total.
>>
>>The start and total values provided by GuC are 32 bits and wrap around
>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>monotonically increasing values, there is a need for this implementation
>>to account for overflows and extend the time to 64 bits before returning
>>busyness to the user. In order to do that, a worker runs periodically at
>>an interval of 1/8th the time it takes for the timestamp to wrap. As an
>>example, that would be once in 27 seconds for a gt clock frequency of
>>19.2 MHz.
>>
>>Note:
>>There might be an overaccounting of busyness due to the fact that GuC
>>may be updating the total and start values while kmd is reading them.
>>(i.e kmd may read the updated total and the stale start). In such a
>>case, user may see higher busyness value followed by smaller ones which
>>would eventually catch up to the higher value.
>>
>>v2: (Tvrtko)
>>- Include details in commit message
>>- Move intel engine busyness function into execlist code
>>- Use union inside engine->stats
>>- Use natural type for ping delay jiffies
>>- Drop active_work condition checks
>>- Use for_each_engine if iterating all engines
>>- Drop seq locking, use spinlock at guc level to update engine stats
>>- Document worker specific details
>>
>>v3: (Tvrtko/Umesh)
>>- Demarcate guc and execlist stat objects with comments
>>- Document known over-accounting issue in commit
>>- Provide a consistent view of guc state
>>- Add hooks to gt park/unpark for guc busyness
>>- Stop/start worker in gt park/unpark path
>>- Drop inline
>>- Move spinlock and worker inits to guc initialization
>>- Drop helpers that are called only once
>>
>>v4: (Tvrtko/Matt/Umesh)
>>- Drop addressed opens from commit message
>>- Get runtime pm in ping, remove from the park path
>>- Use cancel_delayed_work_sync in disable_submission path
>>- Update stats during reset prepare
>>- Skip ping if reset in progress
>>- Explicitly name execlists and guc stats objects
>>- Since disable_submission is called from many places, move resetting
>>   stats to intel_guc_submission_reset_prepare
>>
>>v5: (Tvrtko)
>>- Add a trylock helper that does not sleep and synchronize PMU event
>>   callbacks and worker with gt reset
>>
>>v6: (CI BAT failures)
>>- DUTs using execlist submission failed to boot since __gt_unpark is
>>   called during i915 load. This ends up calling the guc busyness unpark
>>   hook and results in kickstarting an uninitialized worker. Let
>>   park/unpark hooks check if guc submission has been initialized.
>>- drop cant_sleep() from trylock helper since rcu_read_lock takes care
>>   of that.
>>
>>v7: (CI) Fix igt@i915_selftest@live@gt_engines
>>- For guc mode of submission the engine busyness is derived from gt time
>>   domain. Use gt time elapsed as reference in the selftest.
>>- Increase busyness calculation to 10ms duration to ensure batch runs
>>   longer and falls within the busyness tolerances in selftest.
>
>[snip]
>
>>diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>index 75569666105d..24358bef6691 100644
>>--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>>  		struct i915_request *rq;
>>  		ktime_t de, dt;
>>  		ktime_t t[2];
>>+		u32 gt_stamp;
>>  		if (!intel_engine_supports_stats(engine))
>>  			continue;
>>@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>>  		ENGINE_TRACE(engine, "measuring idle time\n");
>>  		preempt_disable();
>>  		de = intel_engine_get_busy_time(engine, &t[0]);
>>-		udelay(100);
>>+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>+		udelay(10000);
>>  		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
>>+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
>>  		preempt_enable();
>>-		dt = ktime_sub(t[1], t[0]);
>>+
>>+		dt = intel_engine_uses_guc(engine) ?
>>+		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
>>+		     ktime_sub(t[1], t[0]);
>
>But this then shows the thing might not work for external callers like 
>PMU who have no idea about GUCPMTIMESTAMP and cannot obtain it anyway.
>
>What is the root cause of the failure here, 100us or clock source? Is 
>the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 100us 
>test period? I forget what frequency it runs at.

guc timestamp is ticking at 19.2 MHz in adlp/rkl (where I ran this).

1)
With 100us, I often see that the batch has not yet started, so I 
get busy time in the range 0 - 60 %. I increased the time such that the 
batch runs long enough to make the scheduling time < 5%.

2)
I did 100 runs on rkl/adlp. No failures on rkl. On adlp, I saw one in 
25 runs show 93%/94% busyness for rcs0 and fail (expected is 95%). For 
that I tried using the guc timestamp thinking it would provide more 
accuracy. It did in my testing, but CI still failed for rkl-guc (110% 
busyness!!), so now I just think we need to tweak the expected busyness 
for guc.

Is 1) acceptable?

For 2) I am thinking of just changing the expected busyness to 90% plus 
for guc mode OR should we just let it fail occasionally? Thoughts?

Thanks,
Umesh

>
>Regards,
>
>Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
@ 2021-10-18 18:35       ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-18 18:35 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:
>
>
>On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
>>With GuC handling scheduling, i915 is not aware of the time that a
>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>this info to provide engine busyness to the user, GuC shares this info
>>with i915 for all engines using shared memory. For each engine, this
>>info contains:
>>
>>- total busyness: total time that the context was running (total)
>>- id: id of the running context (id)
>>- start timestamp: timestamp when the context started running (start)
>>
>>At the time (now) of sampling the engine busyness, if the id is valid
>>(!= ~0), and start is non-zero, then the context is considered to be
>>active and the engine busyness is calculated using the below equation
>>
>>	engine busyness = total + (now - start)
>>
>>All times are obtained from the gt clock base. For inactive contexts,
>>engine busyness is just equal to the total.
>>
>>The start and total values provided by GuC are 32 bits and wrap around
>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>monotonically increasing values, there is a need for this implementation
>>to account for overflows and extend the time to 64 bits before returning
>>busyness to the user. In order to do that, a worker runs periodically at
>>an interval of 1/8th the time it takes for the timestamp to wrap. As an
>>example, that would be once in 27 seconds for a gt clock frequency of
>>19.2 MHz.
>>
>>Note:
>>There might be an overaccounting of busyness due to the fact that GuC
>>may be updating the total and start values while kmd is reading them.
>>(i.e kmd may read the updated total and the stale start). In such a
>>case, user may see higher busyness value followed by smaller ones which
>>would eventually catch up to the higher value.
>>
>>v2: (Tvrtko)
>>- Include details in commit message
>>- Move intel engine busyness function into execlist code
>>- Use union inside engine->stats
>>- Use natural type for ping delay jiffies
>>- Drop active_work condition checks
>>- Use for_each_engine if iterating all engines
>>- Drop seq locking, use spinlock at guc level to update engine stats
>>- Document worker specific details
>>
>>v3: (Tvrtko/Umesh)
>>- Demarcate guc and execlist stat objects with comments
>>- Document known over-accounting issue in commit
>>- Provide a consistent view of guc state
>>- Add hooks to gt park/unpark for guc busyness
>>- Stop/start worker in gt park/unpark path
>>- Drop inline
>>- Move spinlock and worker inits to guc initialization
>>- Drop helpers that are called only once
>>
>>v4: (Tvrtko/Matt/Umesh)
>>- Drop addressed opens from commit message
>>- Get runtime pm in ping, remove from the park path
>>- Use cancel_delayed_work_sync in disable_submission path
>>- Update stats during reset prepare
>>- Skip ping if reset in progress
>>- Explicitly name execlists and guc stats objects
>>- Since disable_submission is called from many places, move resetting
>>   stats to intel_guc_submission_reset_prepare
>>
>>v5: (Tvrtko)
>>- Add a trylock helper that does not sleep and synchronize PMU event
>>   callbacks and worker with gt reset
>>
>>v6: (CI BAT failures)
>>- DUTs using execlist submission failed to boot since __gt_unpark is
>>   called during i915 load. This ends up calling the guc busyness unpark
>>   hook and results in kickstarting an uninitialized worker. Let
>>   park/unpark hooks check if guc submission has been initialized.
>>- drop cant_sleep() from trylock helper since rcu_read_lock takes care
>>   of that.
>>
>>v7: (CI) Fix igt@i915_selftest@live@gt_engines
>>- For guc mode of submission the engine busyness is derived from gt time
>>   domain. Use gt time elapsed as reference in the selftest.
>>- Increase busyness calculation to 10ms duration to ensure batch runs
>>   longer and falls within the busyness tolerances in selftest.
>
>[snip]
>
>>diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>index 75569666105d..24358bef6691 100644
>>--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>>  		struct i915_request *rq;
>>  		ktime_t de, dt;
>>  		ktime_t t[2];
>>+		u32 gt_stamp;
>>  		if (!intel_engine_supports_stats(engine))
>>  			continue;
>>@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>>  		ENGINE_TRACE(engine, "measuring idle time\n");
>>  		preempt_disable();
>>  		de = intel_engine_get_busy_time(engine, &t[0]);
>>-		udelay(100);
>>+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>+		udelay(10000);
>>  		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
>>+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
>>  		preempt_enable();
>>-		dt = ktime_sub(t[1], t[0]);
>>+
>>+		dt = intel_engine_uses_guc(engine) ?
>>+		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
>>+		     ktime_sub(t[1], t[0]);
>
>But this then shows the thing might not work for external callers like 
>PMU who have no idea about GUCPMTIMESTAMP and cannot obtain it anyway.
>
>What is the root cause of the failure here, 100us or clock source? Is 
>the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 100us 
>test period? I forget what frequency it runs at.

guc timestamp is ticking at 19.2 MHz in adlp/rkl (where I ran this).

1)
With 100us, I often see that the batch has not yet started, so I 
get busy time in the range 0 - 60 %. I increased the time such that the 
batch runs long enough to make the scheduling time < 5%.

2)
I did 100 runs on rkl/adlp. No failures on rkl. On adlp, I saw one in 
25 runs show 93%/94% busyness for rcs0 and fail (expected is 95%). For 
that I tried using the guc timestamp thinking it would provide more 
accuracy. It did in my testing, but CI still failed for rkl-guc (110% 
busyness!!), so now I just think we need to tweak the expected busyness 
for guc.

Is 1) acceptable?

For 2) I am thinking of just changing the expected busyness to 90% plus 
for guc mode OR should we just let it fail occasionally? Thoughts?

Thanks,
Umesh

>
>Regards,
>
>Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-18 18:35       ` [Intel-gfx] " Umesh Nerlige Ramappa
@ 2021-10-18 20:35         ` Umesh Nerlige Ramappa
  -1 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-18 20:35 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Mon, Oct 18, 2021 at 11:35:44AM -0700, Umesh Nerlige Ramappa wrote:
>On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:
>>
>>
>>On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
>>>With GuC handling scheduling, i915 is not aware of the time that a
>>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>>this info to provide engine busyness to the user, GuC shares this info
>>>with i915 for all engines using shared memory. For each engine, this
>>>info contains:
>>>
>>>- total busyness: total time that the context was running (total)
>>>- id: id of the running context (id)
>>>- start timestamp: timestamp when the context started running (start)
>>>
>>>At the time (now) of sampling the engine busyness, if the id is valid
>>>(!= ~0), and start is non-zero, then the context is considered to be
>>>active and the engine busyness is calculated using the below equation
>>>
>>>	engine busyness = total + (now - start)
>>>
>>>All times are obtained from the gt clock base. For inactive contexts,
>>>engine busyness is just equal to the total.
>>>
>>>The start and total values provided by GuC are 32 bits and wrap around
>>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>>monotonically increasing values, there is a need for this implementation
>>>to account for overflows and extend the time to 64 bits before returning
>>>busyness to the user. In order to do that, a worker runs periodically at
>>>an interval of 1/8th the time it takes for the timestamp to wrap. As an
>>>example, that would be once in 27 seconds for a gt clock frequency of
>>>19.2 MHz.
>>>
>>>Note:
>>>There might be an overaccounting of busyness due to the fact that GuC
>>>may be updating the total and start values while kmd is reading them.
>>>(i.e kmd may read the updated total and the stale start). In such a
>>>case, user may see higher busyness value followed by smaller ones which
>>>would eventually catch up to the higher value.
>>>
>>>v2: (Tvrtko)
>>>- Include details in commit message
>>>- Move intel engine busyness function into execlist code
>>>- Use union inside engine->stats
>>>- Use natural type for ping delay jiffies
>>>- Drop active_work condition checks
>>>- Use for_each_engine if iterating all engines
>>>- Drop seq locking, use spinlock at guc level to update engine stats
>>>- Document worker specific details
>>>
>>>v3: (Tvrtko/Umesh)
>>>- Demarcate guc and execlist stat objects with comments
>>>- Document known over-accounting issue in commit
>>>- Provide a consistent view of guc state
>>>- Add hooks to gt park/unpark for guc busyness
>>>- Stop/start worker in gt park/unpark path
>>>- Drop inline
>>>- Move spinlock and worker inits to guc initialization
>>>- Drop helpers that are called only once
>>>
>>>v4: (Tvrtko/Matt/Umesh)
>>>- Drop addressed opens from commit message
>>>- Get runtime pm in ping, remove from the park path
>>>- Use cancel_delayed_work_sync in disable_submission path
>>>- Update stats during reset prepare
>>>- Skip ping if reset in progress
>>>- Explicitly name execlists and guc stats objects
>>>- Since disable_submission is called from many places, move resetting
>>>  stats to intel_guc_submission_reset_prepare
>>>
>>>v5: (Tvrtko)
>>>- Add a trylock helper that does not sleep and synchronize PMU event
>>>  callbacks and worker with gt reset
>>>
>>>v6: (CI BAT failures)
>>>- DUTs using execlist submission failed to boot since __gt_unpark is
>>>  called during i915 load. This ends up calling the guc busyness unpark
>>>  hook and results in kickstarting an uninitialized worker. Let
>>>  park/unpark hooks check if guc submission has been initialized.
>>>- drop cant_sleep() from trylock helper since rcu_read_lock takes care
>>>  of that.
>>>
>>>v7: (CI) Fix igt@i915_selftest@live@gt_engines
>>>- For guc mode of submission the engine busyness is derived from gt time
>>>  domain. Use gt time elapsed as reference in the selftest.
>>>- Increase busyness calculation to 10ms duration to ensure batch runs
>>>  longer and falls within the busyness tolerances in selftest.
>>
>>[snip]
>>
>>>diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>index 75569666105d..24358bef6691 100644
>>>--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>>> 		struct i915_request *rq;
>>> 		ktime_t de, dt;
>>> 		ktime_t t[2];
>>>+		u32 gt_stamp;
>>> 		if (!intel_engine_supports_stats(engine))
>>> 			continue;
>>>@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>>> 		ENGINE_TRACE(engine, "measuring idle time\n");
>>> 		preempt_disable();
>>> 		de = intel_engine_get_busy_time(engine, &t[0]);
>>>-		udelay(100);
>>>+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>>+		udelay(10000);
>>> 		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
>>>+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
>>> 		preempt_enable();
>>>-		dt = ktime_sub(t[1], t[0]);
>>>+
>>>+		dt = intel_engine_uses_guc(engine) ?
>>>+		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
>>>+		     ktime_sub(t[1], t[0]);
>>
>>But this then shows the thing might not work for external callers 
>>like PMU who have no idea about GUCPMTIMESTAMP and cannot obtain it 
>>anyway.
>>
>>What is the root cause of the failure here, 100us or clock source? 
>>Is the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 
>>100us test period? I forget what frequency it runs at.
>
>guc timestamp is ticking at 19.2 MHz in adlp/rkl (where I ran this).
>
>1)
>With 100us, I often see that the batch has not yet started, so I 
>get busy time in the range 0 - 60 %. I increased the time such that 
>the batch runs long enough to make the scheduling time < 5%.
>
>2)
>I did 100 runs on rkl/adlp. No failures on rkl.

Sorry, my bad, RKL failed with 91% busyness always (checked it again 
now). I think the first time I ran this, GuC was not enabled by default.

Regards,
Umesh

> On adlp, I saw one in 25 runs show 93%/94% busyness for rcs0 and fail 
>(expected is 95%).  For that I tried using the guc timestamp thinking 
>it would provide more accuracy. It did in my testing, but CI still 
>failed for rkl-guc (110% busyness!!), so now I just think we need to 
>tweak the expected busyness for guc.
>
>Is 1) acceptable?
>
>For 2) I am thinking of just changing the expected busyness to 90% 
>plus for guc mode OR should we just let it fail occasionally? 
>Thoughts?
>
>Thanks,
>Umesh
>
>>
>>Regards,
>>
>>Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
@ 2021-10-18 20:35         ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-18 20:35 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Mon, Oct 18, 2021 at 11:35:44AM -0700, Umesh Nerlige Ramappa wrote:
>On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:
>>
>>
>>On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
>>>With GuC handling scheduling, i915 is not aware of the time that a
>>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>>this info to provide engine busyness to the user, GuC shares this info
>>>with i915 for all engines using shared memory. For each engine, this
>>>info contains:
>>>
>>>- total busyness: total time that the context was running (total)
>>>- id: id of the running context (id)
>>>- start timestamp: timestamp when the context started running (start)
>>>
>>>At the time (now) of sampling the engine busyness, if the id is valid
>>>(!= ~0), and start is non-zero, then the context is considered to be
>>>active and the engine busyness is calculated using the below equation
>>>
>>>	engine busyness = total + (now - start)
>>>
>>>All times are obtained from the gt clock base. For inactive contexts,
>>>engine busyness is just equal to the total.
>>>
>>>The start and total values provided by GuC are 32 bits and wrap around
>>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>>monotonically increasing values, there is a need for this implementation
>>>to account for overflows and extend the time to 64 bits before returning
>>>busyness to the user. In order to do that, a worker runs periodically at
>>>an interval of 1/8th the time it takes for the timestamp to wrap. As an
>>>example, that would be once in 27 seconds for a gt clock frequency of
>>>19.2 MHz.
>>>
>>>Note:
>>>There might be an overaccounting of busyness due to the fact that GuC
>>>may be updating the total and start values while kmd is reading them.
>>>(i.e kmd may read the updated total and the stale start). In such a
>>>case, user may see higher busyness value followed by smaller ones which
>>>would eventually catch up to the higher value.
>>>
>>>v2: (Tvrtko)
>>>- Include details in commit message
>>>- Move intel engine busyness function into execlist code
>>>- Use union inside engine->stats
>>>- Use natural type for ping delay jiffies
>>>- Drop active_work condition checks
>>>- Use for_each_engine if iterating all engines
>>>- Drop seq locking, use spinlock at guc level to update engine stats
>>>- Document worker specific details
>>>
>>>v3: (Tvrtko/Umesh)
>>>- Demarcate guc and execlist stat objects with comments
>>>- Document known over-accounting issue in commit
>>>- Provide a consistent view of guc state
>>>- Add hooks to gt park/unpark for guc busyness
>>>- Stop/start worker in gt park/unpark path
>>>- Drop inline
>>>- Move spinlock and worker inits to guc initialization
>>>- Drop helpers that are called only once
>>>
>>>v4: (Tvrtko/Matt/Umesh)
>>>- Drop addressed opens from commit message
>>>- Get runtime pm in ping, remove from the park path
>>>- Use cancel_delayed_work_sync in disable_submission path
>>>- Update stats during reset prepare
>>>- Skip ping if reset in progress
>>>- Explicitly name execlists and guc stats objects
>>>- Since disable_submission is called from many places, move resetting
>>>  stats to intel_guc_submission_reset_prepare
>>>
>>>v5: (Tvrtko)
>>>- Add a trylock helper that does not sleep and synchronize PMU event
>>>  callbacks and worker with gt reset
>>>
>>>v6: (CI BAT failures)
>>>- DUTs using execlist submission failed to boot since __gt_unpark is
>>>  called during i915 load. This ends up calling the guc busyness unpark
>>>  hook and results in kickstarting an uninitialized worker. Let
>>>  park/unpark hooks check if guc submission has been initialized.
>>>- drop cant_sleep() from trylock helper since rcu_read_lock takes care
>>>  of that.
>>>
>>>v7: (CI) Fix igt@i915_selftest@live@gt_engines
>>>- For guc mode of submission the engine busyness is derived from gt time
>>>  domain. Use gt time elapsed as reference in the selftest.
>>>- Increase busyness calculation to 10ms duration to ensure batch runs
>>>  longer and falls within the busyness tolerances in selftest.
>>
>>[snip]
>>
>>>diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>index 75569666105d..24358bef6691 100644
>>>--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>>> 		struct i915_request *rq;
>>> 		ktime_t de, dt;
>>> 		ktime_t t[2];
>>>+		u32 gt_stamp;
>>> 		if (!intel_engine_supports_stats(engine))
>>> 			continue;
>>>@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>>> 		ENGINE_TRACE(engine, "measuring idle time\n");
>>> 		preempt_disable();
>>> 		de = intel_engine_get_busy_time(engine, &t[0]);
>>>-		udelay(100);
>>>+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>>+		udelay(10000);
>>> 		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
>>>+		gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - gt_stamp;
>>> 		preempt_enable();
>>>-		dt = ktime_sub(t[1], t[0]);
>>>+
>>>+		dt = intel_engine_uses_guc(engine) ?
>>>+		     intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
>>>+		     ktime_sub(t[1], t[0]);
>>
>>But this then shows the thing might not work for external callers 
>>like PMU who have no idea about GUCPMTIMESTAMP and cannot obtain it 
>>anyway.
>>
>>What is the root cause of the failure here, 100us or clock source? 
>>Is the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 
>>100us test period? I forget what frequency it runs at.
>
>guc timestamp is ticking at 19.2 MHz in adlp/rkl (where I ran this).
>
>1)
>With 100us, I often see that the batch has not yet started, so I 
>get busy time in the range 0 - 60 %. I increased the time such that 
>the batch runs long enough to make the scheduling time < 5%.
>
>2)
>I did 100 runs on rkl/adlp. No failures on rkl.

Sorry, my bad, RKL failed with 91% busyness always (checked it again 
now). I think the first time I ran this, GuC was not enabled by default.

Regards,
Umesh

> On adlp, I saw one in 25 runs show 93%/94% busyness for rcs0 and fail 
>(expected is 95%).  For that I tried using the guc timestamp thinking 
>it would provide more accuracy. It did in my testing, but CI still 
>failed for rkl-guc (110% busyness!!), so now I just think we need to 
>tweak the expected busyness for guc.
>
>Is 1) acceptable?
>
>For 2) I am thinking of just changing the expected busyness to 90% 
>plus for guc mode OR should we just let it fail occasionally? 
>Thoughts?
>
>Thanks,
>Umesh
>
>>
>>Regards,
>>
>>Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-18 18:35       ` [Intel-gfx] " Umesh Nerlige Ramappa
@ 2021-10-19  8:32         ` Tvrtko Ursulin
  -1 siblings, 0 replies; 31+ messages in thread
From: Tvrtko Ursulin @ 2021-10-19  8:32 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost


On 18/10/2021 19:35, Umesh Nerlige Ramappa wrote:
> On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:
>>
>>
>> On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
>>> With GuC handling scheduling, i915 is not aware of the time that a
>>> context is scheduled in and out of the engine. Since i915 pmu relies on
>>> this info to provide engine busyness to the user, GuC shares this info
>>> with i915 for all engines using shared memory. For each engine, this
>>> info contains:
>>>
>>> - total busyness: total time that the context was running (total)
>>> - id: id of the running context (id)
>>> - start timestamp: timestamp when the context started running (start)
>>>
>>> At the time (now) of sampling the engine busyness, if the id is valid
>>> (!= ~0), and start is non-zero, then the context is considered to be
>>> active and the engine busyness is calculated using the below equation
>>>
>>>     engine busyness = total + (now - start)
>>>
>>> All times are obtained from the gt clock base. For inactive contexts,
>>> engine busyness is just equal to the total.
>>>
>>> The start and total values provided by GuC are 32 bits and wrap around
>>> in a few minutes. Since perf pmu provides busyness as 64 bit
>>> monotonically increasing values, there is a need for this implementation
>>> to account for overflows and extend the time to 64 bits before returning
>>> busyness to the user. In order to do that, a worker runs periodically at
>>> frequency = 1/8th the time it takes for the timestamp to wrap. As an
>>> example, that would be once in 27 seconds for a gt clock frequency of
>>> 19.2 MHz.
>>>
>>> Note:
>>> There might be an overaccounting of busyness due to the fact that GuC
>>> may be updating the total and start values while kmd is reading them.
>>> (i.e kmd may read the updated total and the stale start). In such a
>>> case, user may see higher busyness value followed by smaller ones which
>>> would eventually catch up to the higher value.
>>>
>>> v2: (Tvrtko)
>>> - Include details in commit message
>>> - Move intel engine busyness function into execlist code
>>> - Use union inside engine->stats
>>> - Use natural type for ping delay jiffies
>>> - Drop active_work condition checks
>>> - Use for_each_engine if iterating all engines
>>> - Drop seq locking, use spinlock at guc level to update engine stats
>>> - Document worker specific details
>>>
>>> v3: (Tvrtko/Umesh)
>>> - Demarcate guc and execlist stat objects with comments
>>> - Document known over-accounting issue in commit
>>> - Provide a consistent view of guc state
>>> - Add hooks to gt park/unpark for guc busyness
>>> - Stop/start worker in gt park/unpark path
>>> - Drop inline
>>> - Move spinlock and worker inits to guc initialization
>>> - Drop helpers that are called only once
>>>
>>> v4: (Tvrtko/Matt/Umesh)
>>> - Drop addressed opens from commit message
>>> - Get runtime pm in ping, remove from the park path
>>> - Use cancel_delayed_work_sync in disable_submission path
>>> - Update stats during reset prepare
>>> - Skip ping if reset in progress
>>> - Explicitly name execlists and guc stats objects
>>> - Since disable_submission is called from many places, move resetting
>>>   stats to intel_guc_submission_reset_prepare
>>>
>>> v5: (Tvrtko)
>>> - Add a trylock helper that does not sleep and synchronize PMU event
>>>   callbacks and worker with gt reset
>>>
>>> v6: (CI BAT failures)
>>> - DUTs using execlist submission failed to boot since __gt_unpark is
>>>   called during i915 load. This ends up calling the guc busyness unpark
>>>   hook and results in kickstarting an uninitialized worker. Let
>>>   park/unpark hooks check if guc submission has been initialized.
>>> - drop cant_sleep() from trylock helper since rcu_read_lock takes care
>>>   of that.
>>>
>>> v7: (CI) Fix igt@i915_selftest@live@gt_engines
>>> - For guc mode of submission the engine busyness is derived from gt time
>>>   domain. Use gt time elapsed as reference in the selftest.
>>> - Increase busyness calculation to 10ms duration to ensure batch runs
>>>   longer and falls within the busyness tolerances in selftest.
>>
>> [snip]
>>
>>> diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c 
>>> b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>> index 75569666105d..24358bef6691 100644
>>> --- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>> +++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>> @@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>>>          struct i915_request *rq;
>>>          ktime_t de, dt;
>>>          ktime_t t[2];
>>> +        u32 gt_stamp;
>>>          if (!intel_engine_supports_stats(engine))
>>>              continue;
>>> @@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>>>          ENGINE_TRACE(engine, "measuring idle time\n");
>>>          preempt_disable();
>>>          de = intel_engine_get_busy_time(engine, &t[0]);
>>> -        udelay(100);
>>> +        gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>> +        udelay(10000);
>>>          de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
>>> +        gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - 
>>> gt_stamp;
>>>          preempt_enable();
>>> -        dt = ktime_sub(t[1], t[0]);
>>> +
>>> +        dt = intel_engine_uses_guc(engine) ?
>>> +             intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
>>> +             ktime_sub(t[1], t[0]);
>>
>> But this then shows the thing might not work for external callers like 
>> PMU who have no idea about GUCPMTIMESTAMP and cannot obtain it anyway.
>>
>> What is the root cause of the failure here, 100us or clock source? Is 
>> the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 100us 
>> test period? I forget what frequency it runs at.
> 
> guc timestamp is ticking at 19.2 MHz in adlp/rkl (where I ran this).

So ~52ns clock granularity, right?

In which case 100us with +/- 52ns error should be max 0.05% error - is 
this math correct?

> 
> 1)
> With 100us, often times I see that the batch has not yet started, so I 
> get busy time in the range 0 - 60 %. I increased the time such that the 
> batch runs long enough to make the scheduling time < 5%.

0-60% should not be possible since there is an igt_wait_for_spinner call 
before measuring starts, which ensures spinner is executing on the GPU.

I think we first need to understand where is this 0 - 60% problem coming 
from because I don't think it is from batch not yet started.

Regards,

Tvrtko

> 
> 2)
> I did a 100 runs on rkl/adlp. No failures on rkl. On adlp, I saw one in 
> 25 runs show 93%/94% busyness for rcs0 and fail (expected is 95%). For 
> that I tried using the guc timestamp thinking it would provide more 
> accuracy. It did in my testing, but CI still failed for rkl-guc (110% 
> busyness!!), so now I just think we need to tweak the expected busyness 
> for guc.
> 
> Is 1) acceptable?
> 
> For 2) I am thinking of just changing the expected busyness to 90% plus 
> for guc mode OR should we just let it fail occasionally? Thoughts?
> 
> Thanks,
> Umesh
> 
>>
>> Regards,
>>
>> Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
@ 2021-10-19  8:32         ` Tvrtko Ursulin
  0 siblings, 0 replies; 31+ messages in thread
From: Tvrtko Ursulin @ 2021-10-19  8:32 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost


On 18/10/2021 19:35, Umesh Nerlige Ramappa wrote:
> On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:
>>
>>
>> On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
>>> With GuC handling scheduling, i915 is not aware of the time that a
>>> context is scheduled in and out of the engine. Since i915 pmu relies on
>>> this info to provide engine busyness to the user, GuC shares this info
>>> with i915 for all engines using shared memory. For each engine, this
>>> info contains:
>>>
>>> - total busyness: total time that the context was running (total)
>>> - id: id of the running context (id)
>>> - start timestamp: timestamp when the context started running (start)
>>>
>>> At the time (now) of sampling the engine busyness, if the id is valid
>>> (!= ~0), and start is non-zero, then the context is considered to be
>>> active and the engine busyness is calculated using the below equation
>>>
>>>     engine busyness = total + (now - start)
>>>
>>> All times are obtained from the gt clock base. For inactive contexts,
>>> engine busyness is just equal to the total.
>>>
>>> The start and total values provided by GuC are 32 bits and wrap around
>>> in a few minutes. Since perf pmu provides busyness as 64 bit
>>> monotonically increasing values, there is a need for this implementation
>>> to account for overflows and extend the time to 64 bits before returning
>>> busyness to the user. In order to do that, a worker runs periodically at
>>> frequency = 1/8th the time it takes for the timestamp to wrap. As an
>>> example, that would be once in 27 seconds for a gt clock frequency of
>>> 19.2 MHz.
>>>
>>> Note:
>>> There might be an overaccounting of busyness due to the fact that GuC
>>> may be updating the total and start values while kmd is reading them.
>>> (i.e kmd may read the updated total and the stale start). In such a
>>> case, user may see higher busyness value followed by smaller ones which
>>> would eventually catch up to the higher value.
>>>
>>> v2: (Tvrtko)
>>> - Include details in commit message
>>> - Move intel engine busyness function into execlist code
>>> - Use union inside engine->stats
>>> - Use natural type for ping delay jiffies
>>> - Drop active_work condition checks
>>> - Use for_each_engine if iterating all engines
>>> - Drop seq locking, use spinlock at guc level to update engine stats
>>> - Document worker specific details
>>>
>>> v3: (Tvrtko/Umesh)
>>> - Demarcate guc and execlist stat objects with comments
>>> - Document known over-accounting issue in commit
>>> - Provide a consistent view of guc state
>>> - Add hooks to gt park/unpark for guc busyness
>>> - Stop/start worker in gt park/unpark path
>>> - Drop inline
>>> - Move spinlock and worker inits to guc initialization
>>> - Drop helpers that are called only once
>>>
>>> v4: (Tvrtko/Matt/Umesh)
>>> - Drop addressed opens from commit message
>>> - Get runtime pm in ping, remove from the park path
>>> - Use cancel_delayed_work_sync in disable_submission path
>>> - Update stats during reset prepare
>>> - Skip ping if reset in progress
>>> - Explicitly name execlists and guc stats objects
>>> - Since disable_submission is called from many places, move resetting
>>>   stats to intel_guc_submission_reset_prepare
>>>
>>> v5: (Tvrtko)
>>> - Add a trylock helper that does not sleep and synchronize PMU event
>>>   callbacks and worker with gt reset
>>>
>>> v6: (CI BAT failures)
>>> - DUTs using execlist submission failed to boot since __gt_unpark is
>>>   called during i915 load. This ends up calling the guc busyness unpark
>>>   hook and results in kickstarting an uninitialized worker. Let
>>>   park/unpark hooks check if guc submission has been initialized.
>>> - drop cant_sleep() from trylock helper since rcu_read_lock takes care
>>>   of that.
>>>
>>> v7: (CI) Fix igt@i915_selftest@live@gt_engines
>>> - For guc mode of submission the engine busyness is derived from gt time
>>>   domain. Use gt time elapsed as reference in the selftest.
>>> - Increase busyness calculation to 10ms duration to ensure batch runs
>>>   longer and falls within the busyness tolerances in selftest.
>>
>> [snip]
>>
>>> diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c 
>>> b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>> index 75569666105d..24358bef6691 100644
>>> --- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>> +++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>> @@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>>>          struct i915_request *rq;
>>>          ktime_t de, dt;
>>>          ktime_t t[2];
>>> +        u32 gt_stamp;
>>>          if (!intel_engine_supports_stats(engine))
>>>              continue;
>>> @@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>>>          ENGINE_TRACE(engine, "measuring idle time\n");
>>>          preempt_disable();
>>>          de = intel_engine_get_busy_time(engine, &t[0]);
>>> -        udelay(100);
>>> +        gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>> +        udelay(10000);
>>>          de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
>>> +        gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - 
>>> gt_stamp;
>>>          preempt_enable();
>>> -        dt = ktime_sub(t[1], t[0]);
>>> +
>>> +        dt = intel_engine_uses_guc(engine) ?
>>> +             intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
>>> +             ktime_sub(t[1], t[0]);
>>
>> But this then shows the thing might not work for external callers like 
>> PMU who have no idea about GUCPMTIMESTAMP and cannot obtain it anyway.
>>
>> What is the root cause of the failure here, 100us or clock source? Is 
>> the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 100us 
>> test period? I forget what frequency it runs at.
> 
> guc timestamp is ticking at 19.2 MHz in adlp/rkl (where I ran this).

So ~52ns clock granularity, right?

In which case 100us with +/- 52ns error should be max 0.05% error - is 
this math correct?

> 
> 1)
> With 100us, often times I see that the batch has not yet started, so I 
> get busy time in the range 0 - 60 %. I increased the time such that the 
> batch runs long enough to make the scheduling time < 5%.

0-60% should not be possible since there is an igt_wait_for_spinner call 
before measuring starts, which ensures spinner is executing on the GPU.

I think we first need to understand where is this 0 - 60% problem coming 
from because I don't think it is from batch not yet started.

Regards,

Tvrtko

> 
> 2)
> I did a 100 runs on rkl/adlp. No failures on rkl. On adlp, I saw one in 
> 25 runs show 93%/94% busyness for rcs0 and fail (expected is 95%). For 
> that I tried using the guc timestamp thinking it would provide more 
> accuracy. It did in my testing, but CI still failed for rkl-guc (110% 
> busyness!!), so now I just think we need to tweak the expected busyness 
> for guc.
> 
> Is 1) acceptable?
> 
> For 2) I am thinking of just changing the expected busyness to 90% plus 
> for guc mode OR should we just let it fail occasionally? Thoughts?
> 
> Thanks,
> Umesh
> 
>>
>> Regards,
>>
>> Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-19  8:32         ` [Intel-gfx] " Tvrtko Ursulin
@ 2021-10-20  4:41           ` Umesh Nerlige Ramappa
  -1 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-20  4:41 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Tue, Oct 19, 2021 at 09:32:07AM +0100, Tvrtko Ursulin wrote:
>
>On 18/10/2021 19:35, Umesh Nerlige Ramappa wrote:
>>On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:
>>>
>>>
>>>On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
>>>>With GuC handling scheduling, i915 is not aware of the time that a
>>>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>>>this info to provide engine busyness to the user, GuC shares this info
>>>>with i915 for all engines using shared memory. For each engine, this
>>>>info contains:
>>>>
>>>>- total busyness: total time that the context was running (total)
>>>>- id: id of the running context (id)
>>>>- start timestamp: timestamp when the context started running (start)
>>>>
>>>>At the time (now) of sampling the engine busyness, if the id is valid
>>>>(!= ~0), and start is non-zero, then the context is considered to be
>>>>active and the engine busyness is calculated using the below equation
>>>>
>>>>    engine busyness = total + (now - start)
>>>>
>>>>All times are obtained from the gt clock base. For inactive contexts,
>>>>engine busyness is just equal to the total.
>>>>
>>>>The start and total values provided by GuC are 32 bits and wrap around
>>>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>>>monotonically increasing values, there is a need for this implementation
>>>>to account for overflows and extend the time to 64 bits before returning
>>>>busyness to the user. In order to do that, a worker runs periodically at
>>>>frequency = 1/8th the time it takes for the timestamp to wrap. As an
>>>>example, that would be once in 27 seconds for a gt clock frequency of
>>>>19.2 MHz.
>>>>
>>>>Note:
>>>>There might be an overaccounting of busyness due to the fact that GuC
>>>>may be updating the total and start values while kmd is reading them.
>>>>(i.e kmd may read the updated total and the stale start). In such a
>>>>case, user may see higher busyness value followed by smaller ones which
>>>>would eventually catch up to the higher value.
>>>>
>>>>v2: (Tvrtko)
>>>>- Include details in commit message
>>>>- Move intel engine busyness function into execlist code
>>>>- Use union inside engine->stats
>>>>- Use natural type for ping delay jiffies
>>>>- Drop active_work condition checks
>>>>- Use for_each_engine if iterating all engines
>>>>- Drop seq locking, use spinlock at guc level to update engine stats
>>>>- Document worker specific details
>>>>
>>>>v3: (Tvrtko/Umesh)
>>>>- Demarcate guc and execlist stat objects with comments
>>>>- Document known over-accounting issue in commit
>>>>- Provide a consistent view of guc state
>>>>- Add hooks to gt park/unpark for guc busyness
>>>>- Stop/start worker in gt park/unpark path
>>>>- Drop inline
>>>>- Move spinlock and worker inits to guc initialization
>>>>- Drop helpers that are called only once
>>>>
>>>>v4: (Tvrtko/Matt/Umesh)
>>>>- Drop addressed opens from commit message
>>>>- Get runtime pm in ping, remove from the park path
>>>>- Use cancel_delayed_work_sync in disable_submission path
>>>>- Update stats during reset prepare
>>>>- Skip ping if reset in progress
>>>>- Explicitly name execlists and guc stats objects
>>>>- Since disable_submission is called from many places, move resetting
>>>>  stats to intel_guc_submission_reset_prepare
>>>>
>>>>v5: (Tvrtko)
>>>>- Add a trylock helper that does not sleep and synchronize PMU event
>>>>  callbacks and worker with gt reset
>>>>
>>>>v6: (CI BAT failures)
>>>>- DUTs using execlist submission failed to boot since __gt_unpark is
>>>>  called during i915 load. This ends up calling the guc busyness unpark
>>>>  hook and results in kickstarting an uninitialized worker. Let
>>>>  park/unpark hooks check if guc submission has been initialized.
>>>>- drop cant_sleep() from trylock helper since rcu_read_lock takes care
>>>>  of that.
>>>>
>>>>v7: (CI) Fix igt@i915_selftest@live@gt_engines
>>>>- For guc mode of submission the engine busyness is derived from gt time
>>>>  domain. Use gt time elapsed as reference in the selftest.
>>>>- Increase busyness calculation to 10ms duration to ensure batch runs
>>>>  longer and falls within the busyness tolerances in selftest.
>>>
>>>[snip]
>>>
>>>>diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c 
>>>>b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>>index 75569666105d..24358bef6691 100644
>>>>--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>>+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>>@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>>>>         struct i915_request *rq;
>>>>         ktime_t de, dt;
>>>>         ktime_t t[2];
>>>>+        u32 gt_stamp;
>>>>         if (!intel_engine_supports_stats(engine))
>>>>             continue;
>>>>@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>>>>         ENGINE_TRACE(engine, "measuring idle time\n");
>>>>         preempt_disable();
>>>>         de = intel_engine_get_busy_time(engine, &t[0]);
>>>>-        udelay(100);
>>>>+        gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>>>+        udelay(10000);
>>>>         de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
>>>>+        gt_stamp = intel_uncore_read(gt->uncore, 
>>>>GUCPMTIMESTAMP) - gt_stamp;
>>>>         preempt_enable();
>>>>-        dt = ktime_sub(t[1], t[0]);
>>>>+
>>>>+        dt = intel_engine_uses_guc(engine) ?
>>>>+             intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
>>>>+             ktime_sub(t[1], t[0]);
>>>
>>>But this then shows the thing might not work for external callers 
>>>like PMU who have no idea about GUCPMTIMESTAMP and cannot obtain 
>>>it anyway.
>>>
>>>What is the root cause of the failure here, 100us or clock source? 
>>>Is the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 
>>>100us test period? I forget what frequency it runs at.
>>
>>guc timestamp is ticking at 19.2 MHz in adlp/rkl (where I ran this).
>
>So ~52ns clock granularity, right?
>
>In which case 100us with +/- 52ns error should be max 0.05% error - is 
>this math correct?

Correct, but correlating GPU and CPU time in this manner is not 
accurate. In some cases the MMIO read of the GUC timestamp can take 
longer and since intel_engine_get_busy_time captures ktime into t[0] and 
t[1] BEFORE the stats are calculated, there is room for error. Even if 
t[0]/t[1] are captured AFTER the stats are calculated, it's the same 
issue. Ideally the test should do

before = ktime_get()
busyness1 = intel_engine_get_busy_time()
udelay(100)
busyness2 = intel_engine_get_busy_time()
after = ktime_get()

busy% = (busyness2 - busyness1)/(after - before) * 100

Isn't that how the user would do it?

Note: Second parameter to intel_engine_get_busy_time is not uabi, it's 
internal to i915 and mostly used by selftest or rps_timer (in non SLPC 
mode).

>
>>
>>1)
>>With 100us, often times I see that the batch has not yet started, so 
>>I get busy time in the range 0 - 60 %. I increased the time such 
>>that the batch runs long enough to make the scheduling time < 5%.
>
>0-60% should not be possible since there is an igt_wait_for_spinner 
>call before measuring starts, which ensures spinner is executing on 
>the GPU.
>
>I think we first need to understand where is this 0 - 60% problem 
>coming from because I don't think it is from batch not yet started.

Looks like it's possible that the GuC would update the busyness stats 
after the batch starts (when context goes from idle to active), so 
that's where this discrepancy is creeping in from. I am thinking of 
capturing busyness before igt_wait_for_spinner and then poll for 
busyness after the wait until the busyness starts ticking. Then we can  
continue with the remaining part of the test. Thoughts?

I tried running the above and ran into busyness > 100%. That's likely 
from the ktime_get issue mentioned above.

Thanks,
Umesh

>
>Regards,
>
>Tvrtko
>
>>
>>2)
>>I did a 100 runs on rkl/adlp. No failures on rkl. On adlp, I saw one 
>>in 25 runs show 93%/94% busyness for rcs0 and fail (expected is 
>>95%). For that I tried using the guc timestamp thinking it would 
>>provide more accuracy. It did in my testing, but CI still failed for 
>>rkl-guc (110% busyness!!), so now I just think we need to tweak the 
>>expected busyness for guc.
>>
>>Is 1) acceptable?
>>
>>For 2) I am thinking of just changing the expected busyness to 90% 
>>plus for guc mode OR should we just let it fail occasionally? 
>>Thoughts?
>>
>>Thanks,
>>Umesh
>>
>>>
>>>Regards,
>>>
>>>Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
@ 2021-10-20  4:41           ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-20  4:41 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Tue, Oct 19, 2021 at 09:32:07AM +0100, Tvrtko Ursulin wrote:
>
>On 18/10/2021 19:35, Umesh Nerlige Ramappa wrote:
>>On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:
>>>
>>>
>>>On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:
>>>>With GuC handling scheduling, i915 is not aware of the time that a
>>>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>>>this info to provide engine busyness to the user, GuC shares this info
>>>>with i915 for all engines using shared memory. For each engine, this
>>>>info contains:
>>>>
>>>>- total busyness: total time that the context was running (total)
>>>>- id: id of the running context (id)
>>>>- start timestamp: timestamp when the context started running (start)
>>>>
>>>>At the time (now) of sampling the engine busyness, if the id is valid
>>>>(!= ~0), and start is non-zero, then the context is considered to be
>>>>active and the engine busyness is calculated using the below equation
>>>>
>>>>    engine busyness = total + (now - start)
>>>>
>>>>All times are obtained from the gt clock base. For inactive contexts,
>>>>engine busyness is just equal to the total.
>>>>
>>>>The start and total values provided by GuC are 32 bits and wrap around
>>>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>>>monotonically increasing values, there is a need for this implementation
>>>>to account for overflows and extend the time to 64 bits before returning
>>>>busyness to the user. In order to do that, a worker runs periodically at
>>>>frequency = 1/8th the time it takes for the timestamp to wrap. As an
>>>>example, that would be once in 27 seconds for a gt clock frequency of
>>>>19.2 MHz.
>>>>
>>>>Note:
>>>>There might be an overaccounting of busyness due to the fact that GuC
>>>>may be updating the total and start values while kmd is reading them.
>>>>(i.e kmd may read the updated total and the stale start). In such a
>>>>case, user may see higher busyness value followed by smaller ones which
>>>>would eventually catch up to the higher value.
>>>>
>>>>v2: (Tvrtko)
>>>>- Include details in commit message
>>>>- Move intel engine busyness function into execlist code
>>>>- Use union inside engine->stats
>>>>- Use natural type for ping delay jiffies
>>>>- Drop active_work condition checks
>>>>- Use for_each_engine if iterating all engines
>>>>- Drop seq locking, use spinlock at guc level to update engine stats
>>>>- Document worker specific details
>>>>
>>>>v3: (Tvrtko/Umesh)
>>>>- Demarcate guc and execlist stat objects with comments
>>>>- Document known over-accounting issue in commit
>>>>- Provide a consistent view of guc state
>>>>- Add hooks to gt park/unpark for guc busyness
>>>>- Stop/start worker in gt park/unpark path
>>>>- Drop inline
>>>>- Move spinlock and worker inits to guc initialization
>>>>- Drop helpers that are called only once
>>>>
>>>>v4: (Tvrtko/Matt/Umesh)
>>>>- Drop addressed opens from commit message
>>>>- Get runtime pm in ping, remove from the park path
>>>>- Use cancel_delayed_work_sync in disable_submission path
>>>>- Update stats during reset prepare
>>>>- Skip ping if reset in progress
>>>>- Explicitly name execlists and guc stats objects
>>>>- Since disable_submission is called from many places, move resetting
>>>>  stats to intel_guc_submission_reset_prepare
>>>>
>>>>v5: (Tvrtko)
>>>>- Add a trylock helper that does not sleep and synchronize PMU event
>>>>  callbacks and worker with gt reset
>>>>
>>>>v6: (CI BAT failures)
>>>>- DUTs using execlist submission failed to boot since __gt_unpark is
>>>>  called during i915 load. This ends up calling the guc busyness unpark
>>>>  hook and results in kickstarting an uninitialized worker. Let
>>>>  park/unpark hooks check if guc submission has been initialized.
>>>>- drop cant_sleep() from trylock helper since rcu_read_lock takes care
>>>>  of that.
>>>>
>>>>v7: (CI) Fix igt@i915_selftest@live@gt_engines
>>>>- For guc mode of submission the engine busyness is derived from gt time
>>>>  domain. Use gt time elapsed as reference in the selftest.
>>>>- Increase busyness calculation to 10ms duration to ensure batch runs
>>>>  longer and falls within the busyness tolerances in selftest.
>>>
>>>[snip]
>>>
>>>>diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c 
>>>>b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>>index 75569666105d..24358bef6691 100644
>>>>--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>>+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
>>>>@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
>>>>         struct i915_request *rq;
>>>>         ktime_t de, dt;
>>>>         ktime_t t[2];
>>>>+        u32 gt_stamp;
>>>>         if (!intel_engine_supports_stats(engine))
>>>>             continue;
>>>>@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
>>>>         ENGINE_TRACE(engine, "measuring idle time\n");
>>>>         preempt_disable();
>>>>         de = intel_engine_get_busy_time(engine, &t[0]);
>>>>-        udelay(100);
>>>>+        gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>>>+        udelay(10000);
>>>>         de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
>>>>+        gt_stamp = intel_uncore_read(gt->uncore, 
>>>>GUCPMTIMESTAMP) - gt_stamp;
>>>>         preempt_enable();
>>>>-        dt = ktime_sub(t[1], t[0]);
>>>>+
>>>>+        dt = intel_engine_uses_guc(engine) ?
>>>>+             intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
>>>>+             ktime_sub(t[1], t[0]);
>>>
>>>But this then shows the thing might not work for external callers 
>>>like PMU who have no idea about GUCPMTIMESTAMP and cannot obtain 
>>>it anyway.
>>>
>>>What is the root cause of the failure here, 100us or clock source? 
>>>Is the granularity of GUCPMTIMESTAMP perhaps simply too coarse for 
>>>100us test period? I forget what frequency it runs at.
>>
>>guc timestamp is ticking at 19.2 MHz in adlp/rkl (where I ran this).
>
>So ~52ns clock granularity, right?
>
>In which case 100us with +/- 52ns error should be max 0.05% error - is 
>this math correct?

Correct, but correlating GPU and CPU time in this manner is not 
accurate. In some cases the MMIO read of the GUC timestamp can take 
longer and since intel_engine_get_busy_time captures ktime into t[0] and 
t[1] BEFORE the stats are calculated, there is room for error. Even if 
t[0]/t[1] are captured AFTER the stats are calculated, it's the same 
issue. Ideally the test should do

before = ktime_get()
busyness1 = intel_engine_get_busy_time()
udelay(100)
busyness2 = intel_engine_get_busy_time()
after = ktime_get()

busy% = (busyness2 - busyness1)/(after - before) * 100

Isn't that how the user would do it?

Note: Second parameter to intel_engine_get_busy_time is not uabi, it's 
internal to i915 and mostly used by selftest or rps_timer (in non SLPC 
mode).

>
>>
>>1)
>>With 100us, often times I see that the batch has not yet started, so 
>>I get busy time in the range 0 - 60 %. I increased the time such 
>>that the batch runs long enough to make the scheduling time < 5%.
>
>0-60% should not be possible since there is an igt_wait_for_spinner 
>call before measuring starts, which ensures spinner is executing on 
>the GPU.
>
>I think we first need to understand where is this 0 - 60% problem 
>coming from because I don't think it is from batch not yet started.

Looks like it's possible that the GuC would update the busyness stats 
after the batch starts (when context goes from idle to active), so 
that's where this discrepancy is creeping in from. I am thinking of 
capturing busyness before igt_wait_for_spinner and then poll for 
busyness after the wait until the busyness starts ticking. Then we can  
continue with the remaining part of the test. Thoughts?

I tried running the above and ran into busyness > 100%. That's likely 
from the ktime_get issue mentioned above.

Thanks,
Umesh

>
>Regards,
>
>Tvrtko
>
>>
>>2)
>>I did 100 runs on rkl/adlp. No failures on rkl. On adlp, I saw one 
>>in 25 runs show 93%/94% busyness for rcs0 and fail (expected is 
>>95%). For that I tried using the guc timestamp thinking it would 
>>provide more accuracy. It did in my testing, but CI still failed for 
>>rkl-guc (110% busyness!!), so now I just think we need to tweak the 
>>expected busyness for guc.
>>
>>Is 1) acceptable?
>>
>>For 2) I am thinking of just changing the expected busyness to 90% 
>>plus for guc mode OR should we just let it fail occasionally? 
>>Thoughts?
>>
>>Thanks,
>>Umesh
>>
>>>
>>>Regards,
>>>
>>>Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2022-10-21  8:42   ` Tvrtko Ursulin
@ 2022-10-22  0:21     ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2022-10-22  0:21 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Matthew Brost, daniel.vetter, intel-gfx, john.c.harrison, dri-devel

On Fri, Oct 21, 2022 at 09:42:53AM +0100, Tvrtko Ursulin wrote:
>
>On 27/10/2021 01:48, Umesh Nerlige Ramappa wrote:
>
>[snip]
>
>>+static void guc_timestamp_ping(struct work_struct *wrk)
>>+{
>>+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>+					     timestamp.work.work);
>>+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	intel_wakeref_t wakeref;
>>+	unsigned long flags;
>>+	int srcu, ret;
>>+
>>+	/*
>>+	 * Synchronize with gt reset to make sure the worker does not
>>+	 * corrupt the engine/guc stats.
>>+	 */
>>+	ret = intel_gt_reset_trylock(gt, &srcu);
>>+	if (ret)
>>+		return;
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>+		__update_guc_busyness_stats(guc);
>
>Spotted one splat today: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12268/bat-adlp-4/igt@i915_pm_rpm@basic-pci-d3-state.html
>
>Could be that reset lock needs to be inside the rpm get. Haven't really thought about it much, could you please check?
>
><4> [300.214744]
><4> [300.214753] ======================================================
><4> [300.214755] WARNING: possible circular locking dependency detected
><4> [300.214758] 6.1.0-rc1-CI_DRM_12268-g86e8558e3283+ #1 Not tainted
><4> [300.214761] ------------------------------------------------------
><4> [300.214762] kworker/10:1H/265 is trying to acquire lock:
><4> [300.214765] ffffffff8275e560 (fs_reclaim){+.+.}-{0:0}, at: __kmem_cache_alloc_node+0x27/0x170
><4> [300.214780]
>but task is already holding lock:
><4> [300.214782] ffffc900013e7e78 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: process_one_work+0x1eb/0x5b0
><4> [300.214793]
>which lock already depends on the new lock.
><4> [300.214794]
>the existing dependency chain (in reverse order) is:
><4> [300.214796]
>-> #2 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
><4> [300.214801]        lock_acquire+0xd3/0x310
><4> [300.214806]        __flush_work+0x77/0x4e0
><4> [300.214811]        __cancel_work_timer+0x14e/0x1f0
><4> [300.214815]        intel_guc_submission_reset_prepare+0x7a/0x420 [i915]
><4> [300.215119]        intel_uc_reset_prepare+0x44/0x50 [i915]
><4> [300.215360]        reset_prepare+0x21/0x80 [i915]
><4> [300.215561]        intel_gt_reset+0x143/0x340 [i915]
><4> [300.215757]        intel_gt_reset_global+0xeb/0x160 [i915]
><4> [300.215946]        intel_gt_handle_error+0x2c2/0x410 [i915]
><4> [300.216137]        intel_gt_debugfs_reset_store+0x59/0xc0 [i915]
><4> [300.216333]        i915_wedged_set+0xc/0x20 [i915]
><4> [300.216513]        simple_attr_write+0xda/0x100
><4> [300.216520]        full_proxy_write+0x4e/0x80
><4> [300.216525]        vfs_write+0xe3/0x4e0
><4> [300.216531]        ksys_write+0x57/0xd0
><4> [300.216535]        do_syscall_64+0x37/0x90
><4> [300.216542]        entry_SYSCALL_64_after_hwframe+0x63/0xcd
><4> [300.216549]
>-> #1 (&gt->reset.mutex){+.+.}-{3:3}:
><4> [300.216556]        lock_acquire+0xd3/0x310
><4> [300.216559]        i915_gem_shrinker_taints_mutex+0x2d/0x50 [i915]

i915_gem_shrinker_taints_mutex seems to have something to do with 
fs_reclaim and so does the stack #0. Any idea what this early init is 
doing? Can this code also result in a gt_wedged case because that might 
explain the stack #2 which is a reset.

><4> [300.216799]        intel_gt_init_reset+0x61/0x80 [i915]
><4> [300.217018]        intel_gt_common_init_early+0x10c/0x190 [i915]
><4> [300.217227]        intel_root_gt_init_early+0x44/0x60 [i915]
><4> [300.217434]        i915_driver_probe+0x9ab/0xf30 [i915]
><4> [300.217615]        i915_pci_probe+0xa5/0x240 [i915]
><4> [300.217796]        pci_device_probe+0x95/0x110
><4> [300.217803]        really_probe+0xd6/0x350
><4> [300.217811]        __driver_probe_device+0x73/0x170
><4> [300.217816]        driver_probe_device+0x1a/0x90
><4> [300.217821]        __driver_attach+0xbc/0x190
><4> [300.217826]        bus_for_each_dev+0x72/0xc0
><4> [300.217831]        bus_add_driver+0x1bb/0x210
><4> [300.217835]        driver_register+0x66/0xc0
><4> [300.217841]        0xffffffffa093001f
><4> [300.217844]        do_one_initcall+0x53/0x2f0
><4> [300.217849]        do_init_module+0x45/0x1c0
><4> [300.217855]        load_module+0x1d5e/0x1e90
><4> [300.217859]        __do_sys_finit_module+0xaf/0x120
><4> [300.217864]        do_syscall_64+0x37/0x90
><4> [300.217869]        entry_SYSCALL_64_after_hwframe+0x63/0xcd
><4> [300.217875]
>-> #0 (fs_reclaim){+.+.}-{0:0}:
><4> [300.217880]        validate_chain+0xb3d/0x2000
><4> [300.217884]        __lock_acquire+0x5a4/0xb70
><4> [300.217888]        lock_acquire+0xd3/0x310
><4> [300.217891]        fs_reclaim_acquire+0xa1/0xd0

fs_reclaim ^

><4> [300.217896]        __kmem_cache_alloc_node+0x27/0x170
><4> [300.217899]        __kmalloc+0x43/0x1a0
><4> [300.217903]        acpi_ns_internalize_name+0x44/0x9f
><4> [300.217909]        acpi_ns_get_node_unlocked+0x6b/0xd7
><4> [300.217914]        acpi_ns_get_node+0x3b/0x54
><4> [300.217918]        acpi_get_handle+0x89/0xb7
><4> [300.217922]        acpi_has_method+0x1c/0x40
><4> [300.217928]        acpi_pci_set_power_state+0x42/0xf0
><4> [300.217935]        pci_power_up+0x20/0x1a0
><4> [300.217940]        pci_pm_default_resume_early+0x9/0x30
><4> [300.217945]        pci_pm_runtime_resume+0x29/0x90
><4> [300.217948]        __rpm_callback+0x3d/0x110
><4> [300.217954]        rpm_callback+0x58/0x60
><4> [300.217959]        rpm_resume+0x548/0x760
><4> [300.217963]        __pm_runtime_resume+0x42/0x80
><4> [300.217968]        __intel_runtime_pm_get+0x19/0x80 [i915]
><4> [300.218170]        guc_timestamp_ping+0x63/0xc0 [i915]
><4> [300.218467]        process_one_work+0x272/0x5b0
><4> [300.218472]        worker_thread+0x37/0x370
><4> [300.218477]        kthread+0xed/0x120
><4> [300.218481]        ret_from_fork+0x1f/0x30
><4> [300.218485]

If the suspend has completed, not sure why guc_timestamp_ping is getting 
called and resulting in pci_power_up in this stack. The park should have 
synchronously canceled the ping worker. Strange.

>other info that might help us debug this:
><4> [300.218487] Chain exists of:
>  fs_reclaim --> &gt->reset.mutex --> (work_completion)(&(&guc->timestamp.work)->work)
><4> [300.218495]  Possible unsafe locking scenario:
><4> [300.218497]        CPU0                    CPU1
><4> [300.218499]        ----                    ----
><4> [300.218501]   lock((work_completion)(&(&guc->timestamp.work)->work));
><4> [300.218505]                                lock(&gt->reset.mutex);
><4> [300.218509]                                lock((work_completion)(&(&guc->timestamp.work)->work));
><4> [300.218512]   lock(fs_reclaim);
><4> [300.218515]
> *** DEADLOCK ***

Still looking into it, could use some help with the above questions.

Thanks,
Umesh
>
>Regards,
>
>Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-27  0:48 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
  2021-10-27 20:02   ` Matthew Brost
@ 2022-10-21  8:42   ` Tvrtko Ursulin
  2022-10-22  0:21     ` Umesh Nerlige Ramappa
  1 sibling, 1 reply; 31+ messages in thread
From: Tvrtko Ursulin @ 2022-10-21  8:42 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, dri-devel
  Cc: Matthew Brost, daniel.vetter, john.c.harrison


On 27/10/2021 01:48, Umesh Nerlige Ramappa wrote:

[snip]

> +static void guc_timestamp_ping(struct work_struct *wrk)
> +{
> +	struct intel_guc *guc = container_of(wrk, typeof(*guc),
> +					     timestamp.work.work);
> +	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +	unsigned long flags;
> +	int srcu, ret;
> +
> +	/*
> +	 * Synchronize with gt reset to make sure the worker does not
> +	 * corrupt the engine/guc stats.
> +	 */
> +	ret = intel_gt_reset_trylock(gt, &srcu);
> +	if (ret)
> +		return;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
> +		__update_guc_busyness_stats(guc);

Spotted one splat today: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12268/bat-adlp-4/igt@i915_pm_rpm@basic-pci-d3-state.html

Could be that reset lock needs to be inside the rpm get. Haven't really thought about it much, could you please check?

<4> [300.214744]
<4> [300.214753] ======================================================
<4> [300.214755] WARNING: possible circular locking dependency detected
<4> [300.214758] 6.1.0-rc1-CI_DRM_12268-g86e8558e3283+ #1 Not tainted
<4> [300.214761] ------------------------------------------------------
<4> [300.214762] kworker/10:1H/265 is trying to acquire lock:
<4> [300.214765] ffffffff8275e560 (fs_reclaim){+.+.}-{0:0}, at: __kmem_cache_alloc_node+0x27/0x170
<4> [300.214780]
but task is already holding lock:
<4> [300.214782] ffffc900013e7e78 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: process_one_work+0x1eb/0x5b0
<4> [300.214793]
which lock already depends on the new lock.
<4> [300.214794]
the existing dependency chain (in reverse order) is:
<4> [300.214796]
-> #2 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
<4> [300.214801]        lock_acquire+0xd3/0x310
<4> [300.214806]        __flush_work+0x77/0x4e0
<4> [300.214811]        __cancel_work_timer+0x14e/0x1f0
<4> [300.214815]        intel_guc_submission_reset_prepare+0x7a/0x420 [i915]
<4> [300.215119]        intel_uc_reset_prepare+0x44/0x50 [i915]
<4> [300.215360]        reset_prepare+0x21/0x80 [i915]
<4> [300.215561]        intel_gt_reset+0x143/0x340 [i915]
<4> [300.215757]        intel_gt_reset_global+0xeb/0x160 [i915]
<4> [300.215946]        intel_gt_handle_error+0x2c2/0x410 [i915]
<4> [300.216137]        intel_gt_debugfs_reset_store+0x59/0xc0 [i915]
<4> [300.216333]        i915_wedged_set+0xc/0x20 [i915]
<4> [300.216513]        simple_attr_write+0xda/0x100
<4> [300.216520]        full_proxy_write+0x4e/0x80
<4> [300.216525]        vfs_write+0xe3/0x4e0
<4> [300.216531]        ksys_write+0x57/0xd0
<4> [300.216535]        do_syscall_64+0x37/0x90
<4> [300.216542]        entry_SYSCALL_64_after_hwframe+0x63/0xcd
<4> [300.216549]
-> #1 (&gt->reset.mutex){+.+.}-{3:3}:
<4> [300.216556]        lock_acquire+0xd3/0x310
<4> [300.216559]        i915_gem_shrinker_taints_mutex+0x2d/0x50 [i915]
<4> [300.216799]        intel_gt_init_reset+0x61/0x80 [i915]
<4> [300.217018]        intel_gt_common_init_early+0x10c/0x190 [i915]
<4> [300.217227]        intel_root_gt_init_early+0x44/0x60 [i915]
<4> [300.217434]        i915_driver_probe+0x9ab/0xf30 [i915]
<4> [300.217615]        i915_pci_probe+0xa5/0x240 [i915]
<4> [300.217796]        pci_device_probe+0x95/0x110
<4> [300.217803]        really_probe+0xd6/0x350
<4> [300.217811]        __driver_probe_device+0x73/0x170
<4> [300.217816]        driver_probe_device+0x1a/0x90
<4> [300.217821]        __driver_attach+0xbc/0x190
<4> [300.217826]        bus_for_each_dev+0x72/0xc0
<4> [300.217831]        bus_add_driver+0x1bb/0x210
<4> [300.217835]        driver_register+0x66/0xc0
<4> [300.217841]        0xffffffffa093001f
<4> [300.217844]        do_one_initcall+0x53/0x2f0
<4> [300.217849]        do_init_module+0x45/0x1c0
<4> [300.217855]        load_module+0x1d5e/0x1e90
<4> [300.217859]        __do_sys_finit_module+0xaf/0x120
<4> [300.217864]        do_syscall_64+0x37/0x90
<4> [300.217869]        entry_SYSCALL_64_after_hwframe+0x63/0xcd
<4> [300.217875]
-> #0 (fs_reclaim){+.+.}-{0:0}:
<4> [300.217880]        validate_chain+0xb3d/0x2000
<4> [300.217884]        __lock_acquire+0x5a4/0xb70
<4> [300.217888]        lock_acquire+0xd3/0x310
<4> [300.217891]        fs_reclaim_acquire+0xa1/0xd0
<4> [300.217896]        __kmem_cache_alloc_node+0x27/0x170
<4> [300.217899]        __kmalloc+0x43/0x1a0
<4> [300.217903]        acpi_ns_internalize_name+0x44/0x9f
<4> [300.217909]        acpi_ns_get_node_unlocked+0x6b/0xd7
<4> [300.217914]        acpi_ns_get_node+0x3b/0x54
<4> [300.217918]        acpi_get_handle+0x89/0xb7
<4> [300.217922]        acpi_has_method+0x1c/0x40
<4> [300.217928]        acpi_pci_set_power_state+0x42/0xf0
<4> [300.217935]        pci_power_up+0x20/0x1a0
<4> [300.217940]        pci_pm_default_resume_early+0x9/0x30
<4> [300.217945]        pci_pm_runtime_resume+0x29/0x90
<4> [300.217948]        __rpm_callback+0x3d/0x110
<4> [300.217954]        rpm_callback+0x58/0x60
<4> [300.217959]        rpm_resume+0x548/0x760
<4> [300.217963]        __pm_runtime_resume+0x42/0x80
<4> [300.217968]        __intel_runtime_pm_get+0x19/0x80 [i915]
<4> [300.218170]        guc_timestamp_ping+0x63/0xc0 [i915]
<4> [300.218467]        process_one_work+0x272/0x5b0
<4> [300.218472]        worker_thread+0x37/0x370
<4> [300.218477]        kthread+0xed/0x120
<4> [300.218481]        ret_from_fork+0x1f/0x30
<4> [300.218485]
other info that might help us debug this:
<4> [300.218487] Chain exists of:
   fs_reclaim --> &gt->reset.mutex --> (work_completion)(&(&guc->timestamp.work)->work)
<4> [300.218495]  Possible unsafe locking scenario:
<4> [300.218497]        CPU0                    CPU1
<4> [300.218499]        ----                    ----
<4> [300.218501]   lock((work_completion)(&(&guc->timestamp.work)->work));
<4> [300.218505]                                lock(&gt->reset.mutex);
<4> [300.218509]                                lock((work_completion)(&(&guc->timestamp.work)->work));
<4> [300.218512]   lock(fs_reclaim);
<4> [300.218515]
  *** DEADLOCK ***

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-27  0:48 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
@ 2021-10-27 20:02   ` Matthew Brost
  2022-10-21  8:42   ` Tvrtko Ursulin
  1 sibling, 0 replies; 31+ messages in thread
From: Matthew Brost @ 2021-10-27 20:02 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa
  Cc: intel-gfx, dri-devel, john.c.harrison, Tvrtko Ursulin, daniel.vetter

On Tue, Oct 26, 2021 at 05:48:21PM -0700, Umesh Nerlige Ramappa wrote:
> With GuC handling scheduling, i915 is not aware of the time that a
> context is scheduled in and out of the engine. Since i915 pmu relies on
> this info to provide engine busyness to the user, GuC shares this info
> with i915 for all engines using shared memory. For each engine, this
> info contains:
> 
> - total busyness: total time that the context was running (total)
> - id: id of the running context (id)
> - start timestamp: timestamp when the context started running (start)
> 
> At the time (now) of sampling the engine busyness, if the id is valid
> (!= ~0), and start is non-zero, then the context is considered to be
> active and the engine busyness is calculated using the below equation
> 
> 	engine busyness = total + (now - start)
> 
> All times are obtained from the gt clock base. For inactive contexts,
> engine busyness is just equal to the total.
> 
> The start and total values provided by GuC are 32 bits and wrap around
> in a few minutes. Since perf pmu provides busyness as 64 bit
> monotonically increasing values, there is a need for this implementation
> to account for overflows and extend the time to 64 bits before returning
> busyness to the user. In order to do that, a worker runs periodically at
> frequency = 1/8th the time it takes for the timestamp to wrap. As an
> example, that would be once in 27 seconds for a gt clock frequency of
> 19.2 MHz.
> 
> Note:
> There might be an over-accounting of busyness due to the fact that GuC
> may be updating the total and start values while kmd is reading them.
> (i.e kmd may read the updated total and the stale start). In such a
> case, user may see higher busyness value followed by smaller ones which
> would eventually catch up to the higher value.
> 
> v2: (Tvrtko)
> - Include details in commit message
> - Move intel engine busyness function into execlist code
> - Use union inside engine->stats
> - Use natural type for ping delay jiffies
> - Drop active_work condition checks
> - Use for_each_engine if iterating all engines
> - Drop seq locking, use spinlock at GuC level to update engine stats
> - Document worker specific details
> 
> v3: (Tvrtko/Umesh)
> - Demarcate GuC and execlist stat objects with comments
> - Document known over-accounting issue in commit
> - Provide a consistent view of GuC state
> - Add hooks to gt park/unpark for GuC busyness
> - Stop/start worker in gt park/unpark path
> - Drop inline
> - Move spinlock and worker inits to GuC initialization
> - Drop helpers that are called only once
> 
> v4: (Tvrtko/Matt/Umesh)
> - Drop addressed opens from commit message
> - Get runtime pm in ping, remove from the park path
> - Use cancel_delayed_work_sync in disable_submission path
> - Update stats during reset prepare
> - Skip ping if reset in progress
> - Explicitly name execlists and GuC stats objects
> - Since disable_submission is called from many places, move resetting
>   stats to intel_guc_submission_reset_prepare
> 
> v5: (Tvrtko)
> - Add a trylock helper that does not sleep and synchronize PMU event
>   callbacks and worker with gt reset
> 
> v6: (CI BAT failures)
> - DUTs using execlist submission failed to boot since __gt_unpark is
>   called during i915 load. This ends up calling the GuC busyness unpark
>   hook and results in kick-starting an uninitialized worker. Let
>   park/unpark hooks check if GuC submission has been initialized.
> - drop cant_sleep() from trylock helper since rcu_read_lock takes care
>   of that.
> 
> v7: (CI) Fix igt@i915_selftest@live@gt_engines
> - For GuC mode of submission the engine busyness is derived from gt time
>   domain. Use gt time elapsed as reference in the selftest.
> - Increase busyness calculation to 10ms duration to ensure batch runs
>   longer and falls within the busyness tolerances in selftest.
> 
> v8:
> - Use ktime_get in selftest as before
> - intel_reset_trylock_no_wait results in a lockdep splat that is not
>   trivial to fix since the PMU callback runs in irq context and the
>   reset paths are tightly knit into the driver. The test that uncovers
>   this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
>   instead use the reset_count to synchronize with gt reset during pmu
>   callback. For the ping, continue to use intel_reset_trylock since ping
>   is not run in irq context.
> 
> - GuC PM timestamp does not tick when GuC is idle. This can potentially
>   result in wrong busyness values when a context is active on the
>   engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
>   process the GuC busyness stats. This works since both GuC timestamp and
>   RING timestamp are synced with the same clock.
> 
> - The busyness stats may get updated after the batch starts running.
>   This delay causes the busyness reported for 100us duration to fall
>   below 95% in the selftest. The only option at this time is to wait for
>   GuC busyness to change from idle to active before we sample busyness
>   over a 100us period.
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
>  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
>  .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
>  drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
>  drivers/gpu/drm/i915/gt/selftest_engine_pm.c  |  33 +++
>  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
>  drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 277 ++++++++++++++++++
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
>  drivers/gpu/drm/i915/i915_reg.h               |   2 +
>  13 files changed, 453 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 2de396e34d83..332756036007 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1915,23 +1915,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>  	intel_engine_print_breadcrumbs(engine, m);
>  }
>  
> -static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
> -					    ktime_t *now)
> -{
> -	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> -	ktime_t total = stats->total;
> -
> -	/*
> -	 * If the engine is executing something at the moment
> -	 * add it to the total.
> -	 */
> -	*now = ktime_get();
> -	if (READ_ONCE(stats->active))
> -		total = ktime_add(total, ktime_sub(*now, stats->start));
> -
> -	return total;
> -}
> -
>  /**
>   * intel_engine_get_busy_time() - Return current accumulated engine busyness
>   * @engine: engine to report on
> @@ -1941,16 +1924,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>   */
>  ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
>  {
> -	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> -	unsigned int seq;
> -	ktime_t total;
> -
> -	do {
> -		seq = read_seqcount_begin(&stats->lock);
> -		total = __intel_engine_get_busy_time(engine, now);
> -	} while (read_seqcount_retry(&stats->lock, seq));
> -
> -	return total;
> +	return engine->busyness(engine, now);
>  }
>  
>  struct intel_context *
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 24fa7fb0e7de..5732e0d71513 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
>  	ktime_t start;
>  };
>  
> +struct intel_engine_guc_stats {
> +	/**
> +	 * @running: Active state of the engine when busyness was last sampled.
> +	 */
> +	bool running;
> +
> +	/**
> +	 * @prev_total: Previous value of total runtime clock cycles.
> +	 */
> +	u32 prev_total;
> +
> +	/**
> +	 * @total_gt_clks: Total gt clock cycles this engine was busy.
> +	 */
> +	u64 total_gt_clks;
> +
> +	/**
> +	 * @start_gt_clk: GT clock time of last idle to active transition.
> +	 */
> +	u64 start_gt_clk;
> +};
> +
>  struct intel_engine_cs {
>  	struct drm_i915_private *i915;
>  	struct intel_gt *gt;
> @@ -466,6 +488,12 @@ struct intel_engine_cs {
>  	void		(*add_active_request)(struct i915_request *rq);
>  	void		(*remove_active_request)(struct i915_request *rq);
>  
> +	/*
> +	 * Get engine busyness and the time at which the busyness was sampled.
> +	 */
> +	ktime_t		(*busyness)(struct intel_engine_cs *engine,
> +				    ktime_t *now);
> +
>  	struct intel_engine_execlists execlists;
>  
>  	/*
> @@ -515,7 +543,10 @@ struct intel_engine_cs {
>  	u32 (*get_cmd_length_mask)(u32 cmd_header);
>  
>  	struct {
> -		struct intel_engine_execlists_stats execlists;
> +		union {
> +			struct intel_engine_execlists_stats execlists;
> +			struct intel_engine_guc_stats guc;
> +		};
>  
>  		/**
>  		 * @rps: Utilisation at last RPS sampling.
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index bedb80057046..ca03880fa7e4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -3293,6 +3293,38 @@ static void execlists_release(struct intel_engine_cs *engine)
>  	lrc_fini_wa_ctx(engine);
>  }
>  
> +static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
> +					   ktime_t *now)
> +{
> +	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> +	ktime_t total = stats->total;
> +
> +	/*
> +	 * If the engine is executing something at the moment
> +	 * add it to the total.
> +	 */
> +	*now = ktime_get();
> +	if (READ_ONCE(stats->active))
> +		total = ktime_add(total, ktime_sub(*now, stats->start));
> +
> +	return total;
> +}
> +
> +static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
> +					 ktime_t *now)
> +{
> +	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> +	unsigned int seq;
> +	ktime_t total;
> +
> +	do {
> +		seq = read_seqcount_begin(&stats->lock);
> +		total = __execlists_engine_busyness(engine, now);
> +	} while (read_seqcount_retry(&stats->lock, seq));
> +
> +	return total;
> +}
> +
>  static void
>  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>  {
> @@ -3349,6 +3381,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>  		engine->emit_bb_start = gen8_emit_bb_start;
>  	else
>  		engine->emit_bb_start = gen8_emit_bb_start_noarb;
> +
> +	engine->busyness = execlists_engine_busyness;
>  }
>  
>  static void logical_ring_default_irqs(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 524eaf678790..b4a8594bc46c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
>  	intel_rc6_unpark(&gt->rc6);
>  	intel_rps_unpark(&gt->rps);
>  	i915_pmu_gt_unparked(i915);
> +	intel_guc_busyness_unpark(gt);
>  
>  	intel_gt_unpark_requests(gt);
>  	runtime_begin(gt);
> @@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
>  	runtime_end(gt);
>  	intel_gt_park_requests(gt);
>  
> +	intel_guc_busyness_park(gt);
>  	i915_vma_parked(gt);
>  	i915_pmu_gt_parked(i915);
>  	intel_rps_park(&gt->rps);
> diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> index 75569666105d..0bfd738dbf3a 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> @@ -214,6 +214,31 @@ static int live_engine_timestamps(void *arg)
>  	return 0;
>  }
>  
> +static int __spin_until_busier(struct intel_engine_cs *engine, ktime_t busyness)
> +{
> +	ktime_t start, unused, dt;
> +
> +	if (!intel_engine_uses_guc(engine))
> +		return 0;
> +
> +	/*
> +	 * In GuC mode of submission, the busyness stats may get updated after
> +	 * the batch starts running. Poll for a change in busyness and timeout
> +	 * after 500 us.
> +	 */
> +	start = ktime_get();
> +	while (intel_engine_get_busy_time(engine, &unused) == busyness) {
> +		dt = ktime_get() - start;
> +		if (dt > 500000) {
> +			pr_err("active wait timed out %lld\n", dt);
> +			ENGINE_TRACE(engine, "active wait time out %lld\n", dt);
> +			return -ETIME;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int live_engine_busy_stats(void *arg)
>  {
>  	struct intel_gt *gt = arg;
> @@ -232,6 +257,7 @@ static int live_engine_busy_stats(void *arg)
>  	GEM_BUG_ON(intel_gt_pm_is_awake(gt));
>  	for_each_engine(engine, gt, id) {
>  		struct i915_request *rq;
> +		ktime_t busyness, dummy;
>  		ktime_t de, dt;
>  		ktime_t t[2];
>  
> @@ -274,12 +300,19 @@ static int live_engine_busy_stats(void *arg)
>  		}
>  		i915_request_add(rq);
>  
> +		busyness = intel_engine_get_busy_time(engine, &dummy);
>  		if (!igt_wait_for_spinner(&spin, rq)) {
>  			intel_gt_set_wedged(engine->gt);
>  			err = -ETIME;
>  			goto end;
>  		}
>  
> +		err = __spin_until_busier(engine, busyness);
> +		if (err) {
> +			GEM_TRACE_DUMP();
> +			goto end;
> +		}
> +
>  		ENGINE_TRACE(engine, "measuring busy time\n");
>  		preempt_disable();
>  		de = intel_engine_get_busy_time(engine, &t[0]);
> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> index ba10bd374cee..fe5d7d261797 100644
> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> @@ -144,6 +144,7 @@ enum intel_guc_action {
>  	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>  	INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
>  	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
> +	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>  	INTEL_GUC_ACTION_LIMIT
>  };
>  
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index 31cf9fb48c7e..1cb46098030d 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -138,6 +138,8 @@ struct intel_guc {
>  	u32 ads_regset_size;
>  	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
>  	u32 ads_golden_ctxt_size;
> +	/** @ads_engine_usage_size: size of engine usage in the ADS */
> +	u32 ads_engine_usage_size;
>  
>  	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
>  	struct i915_vma *lrc_desc_pool;
> @@ -172,6 +174,34 @@ struct intel_guc {
>  
>  	/** @send_mutex: used to serialize the intel_guc_send actions */
>  	struct mutex send_mutex;
> +
> +	/**
> +	 * @timestamp: GT timestamp object that stores a copy of the timestamp
> +	 * and adjusts it for overflow using a worker.
> +	 */
> +	struct {
> +		/**
> +		 * @lock: Lock protecting the below fields and the engine stats.
> +		 */
> +		spinlock_t lock;
> +
> +		/**
> +		 * @gt_stamp: 64 bit extended value of the GT timestamp.
> +		 */
> +		u64 gt_stamp;
> +
> +		/**
> +		 * @ping_delay: Period for polling the GT timestamp for
> +		 * overflow.
> +		 */
> +		unsigned long ping_delay;
> +
> +		/**
> +		 * @work: Periodic work to adjust GT timestamp, engine and
> +		 * context usage for overflows.
> +		 */
> +		struct delayed_work work;
> +	} timestamp;
>  };
>  
>  static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> index 621c893a009f..1a1edae67e4e 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> @@ -26,6 +26,8 @@
>   *      | guc_policies                          |
>   *      +---------------------------------------+
>   *      | guc_gt_system_info                    |
> + *      +---------------------------------------+
> + *      | guc_engine_usage                      |
>   *      +---------------------------------------+ <== static
>   *      | guc_mmio_reg[countA] (engine 0.0)     |
>   *      | guc_mmio_reg[countB] (engine 0.1)     |
> @@ -47,6 +49,7 @@ struct __guc_ads_blob {
>  	struct guc_ads ads;
>  	struct guc_policies policies;
>  	struct guc_gt_system_info system_info;
> +	struct guc_engine_usage engine_usage;
>  	/* From here on, location is dynamic! Refer to above diagram. */
>  	struct guc_mmio_reg regset[0];
>  } __packed;
> @@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>  
>  	guc_ads_private_data_reset(guc);
>  }
> +
> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
> +{
> +	struct __guc_ads_blob *blob = guc->ads_blob;
> +	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
> +	u32 offset = base + ptr_offset(blob, engine_usage);
> +
> +	return offset;
> +}
> +
> +struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
> +{
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	struct __guc_ads_blob *blob = guc->ads_blob;
> +	u8 guc_class = engine_class_to_guc_class(engine->class);
> +
> +	return &blob->engine_usage.engines[guc_class][ilog2(engine->logical_mask)];
> +}
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> index 3d85051d57e4..e74c110facff 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> @@ -6,8 +6,11 @@
>  #ifndef _INTEL_GUC_ADS_H_
>  #define _INTEL_GUC_ADS_H_
>  
> +#include <linux/types.h>
> +
>  struct intel_guc;
>  struct drm_printer;
> +struct intel_engine_cs;
>  
>  int intel_guc_ads_create(struct intel_guc *guc);
>  void intel_guc_ads_destroy(struct intel_guc *guc);
> @@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
>  void intel_guc_ads_reset(struct intel_guc *guc);
>  void intel_guc_ads_print_policy_info(struct intel_guc *guc,
>  				     struct drm_printer *p);
> +struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
>  
>  #endif
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index 722933e26347..7072e30e99f4 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -294,6 +294,19 @@ struct guc_ads {
>  	u32 reserved[15];
>  } __packed;
>  
> +/* Engine usage stats */
> +struct guc_engine_usage_record {
> +	u32 current_context_index;
> +	u32 last_switch_in_stamp;
> +	u32 reserved0;
> +	u32 total_runtime;
> +	u32 reserved1[4];
> +} __packed;
> +
> +struct guc_engine_usage {
> +	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];

Again like I mentioned in the previous patch, I'd define this
sub-structure inline. But that is just my opinion and doesn't really
matter. I believe I understand everything else this patch is doing and
it looks good to me.

With that:
Reviewed-by: Matthew Brost <matthew.brost@intel.com> 

> +} __packed;
> +
>  /* GuC logging structures */
>  
>  enum guc_log_buffer_type {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 38b47e73e35d..5cc49c0b3889 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -13,6 +13,7 @@
>  #include "gt/intel_engine_heartbeat.h"
>  #include "gt/intel_gpu_commands.h"
>  #include "gt/intel_gt.h"
> +#include "gt/intel_gt_clock_utils.h"
>  #include "gt/intel_gt_irq.h"
>  #include "gt/intel_gt_pm.h"
>  #include "gt/intel_gt_requests.h"
> @@ -21,6 +22,7 @@
>  #include "gt/intel_mocs.h"
>  #include "gt/intel_ring.h"
>  
> +#include "intel_guc_ads.h"
>  #include "intel_guc_submission.h"
>  
>  #include "i915_drv.h"
> @@ -1077,6 +1079,272 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>  	xa_unlock_irqrestore(&guc->context_lookup, flags);
>  }
>  
> +/*
> + * GuC stores busyness stats for each engine at context in/out boundaries. A
> + * context 'in' logs execution start time, 'out' adds in -> out delta to total.
> + * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
> + * GuC.
> + *
> + * __i915_pmu_event_read samples engine busyness. When sampling, if context id
> + * is valid (!= ~0) and start is non-zero, the engine is considered to be
> + * active. For an active engine total busyness = total + (now - start), where
> + * 'now' is the time at which the busyness is sampled. For inactive engine,
> + * total busyness = total.
> + *
> + * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
> + *
> + * The start and total values provided by GuC are 32 bits and wrap around in a
> + * few minutes. Since perf pmu provides busyness as 64 bit monotonically
> + * increasing ns values, there is a need for this implementation to account for
> + * overflows and extend the GuC provided values to 64 bits before returning
> + * busyness to the user. In order to do that, a worker runs periodically at
> + * frequency = 1/8th the time it takes for the timestamp to wrap (i.e. once in
> + * 27 seconds for a gt clock frequency of 19.2 MHz).
> + */
> +
> +#define WRAP_TIME_CLKS U32_MAX
> +#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
> +
> +static void
> +__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
> +{
> +	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
> +	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
> +
> +	if (new_start == lower_32_bits(*prev_start))
> +		return;
> +
> +	if (new_start < gt_stamp_last &&
> +	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
> +		gt_stamp_hi++;
> +
> +	if (new_start > gt_stamp_last &&
> +	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
> +		gt_stamp_hi--;
> +
> +	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
> +}
> +
> +static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
> +{
> +	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
> +	struct intel_engine_guc_stats *stats = &engine->stats.guc;
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	u32 last_switch = rec->last_switch_in_stamp;
> +	u32 ctx_id = rec->current_context_index;
> +	u32 total = rec->total_runtime;
> +
> +	lockdep_assert_held(&guc->timestamp.lock);
> +
> +	stats->running = ctx_id != ~0U && last_switch;
> +	if (stats->running)
> +		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
> +
> +	/*
> +	 * Instead of adjusting the total for overflow, just add the
> +	 * difference from previous sample stats->total_gt_clks
> +	 */
> +	if (total && total != ~0U) {
> +		stats->total_gt_clks += (u32)(total - stats->prev_total);
> +		stats->prev_total = total;
> +	}
> +}
> +
> +static void guc_update_pm_timestamp(struct intel_guc *guc,
> +				    struct intel_engine_cs *engine,
> +				    ktime_t *now)
> +{
> +	u32 gt_stamp_now, gt_stamp_hi;
> +
> +	lockdep_assert_held(&guc->timestamp.lock);
> +
> +	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
> +	gt_stamp_now = intel_uncore_read(engine->uncore,
> +					 RING_TIMESTAMP(engine->mmio_base));
> +	*now = ktime_get();
> +
> +	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
> +		gt_stamp_hi++;
> +
> +	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
> +}
> +
> +/*
> + * Unlike the execlist mode of submission total and active times are in terms of
> + * gt clocks. The *now parameter is retained to return the cpu time at which the
> + * busyness was sampled.
> + */
> +static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
> +{
> +	struct intel_engine_guc_stats stats_saved, *stats = &engine->stats.guc;
> +	struct i915_gpu_error *gpu_error = &engine->i915->gpu_error;
> +	struct intel_gt *gt = engine->gt;
> +	struct intel_guc *guc = &gt->uc.guc;
> +	u64 total, gt_stamp_saved;
> +	unsigned long flags;
> +	u32 reset_count;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	/*
> +	 * If a reset happened, we risk reading partially updated
> +	 * engine busyness from GuC, so we just use the driver stored
> +	 * copy of busyness. Synchronize with gt reset using reset_count.
> +	 */
> +	reset_count = i915_reset_count(gpu_error);
> +
> +	*now = ktime_get();
> +
> +	/*
> +	 * The active busyness depends on start_gt_clk and gt_stamp.
> +	 * gt_stamp is updated by i915 only when gt is awake and the
> +	 * start_gt_clk is derived from GuC state. To get a consistent
> +	 * view of activity, we query the GuC state only if gt is awake.
> +	 */
> +	stats_saved = *stats;
> +	gt_stamp_saved = guc->timestamp.gt_stamp;
> +	if (intel_gt_pm_get_if_awake(gt)) {
> +		guc_update_engine_gt_clks(engine);
> +		guc_update_pm_timestamp(guc, engine, now);
> +		intel_gt_pm_put_async(gt);
> +		if (i915_reset_count(gpu_error) != reset_count) {
> +			*stats = stats_saved;
> +			guc->timestamp.gt_stamp = gt_stamp_saved;
> +		}
> +	}
> +
> +	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
> +	if (stats->running) {
> +		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
> +
> +		total += intel_gt_clock_interval_to_ns(gt, clk);
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +
> +	return ns_to_ktime(total);
> +}
> +
> +static void __reset_guc_busyness_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	unsigned long flags;
> +	ktime_t unused;
> +
> +	cancel_delayed_work_sync(&guc->timestamp.work);
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	for_each_engine(engine, gt, id) {
> +		guc_update_pm_timestamp(guc, engine, &unused);
> +		guc_update_engine_gt_clks(engine);
> +		engine->stats.guc.prev_total = 0;
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void __update_guc_busyness_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	ktime_t unused;
> +
> +	for_each_engine(engine, gt, id) {
> +		guc_update_pm_timestamp(guc, engine, &unused);
> +		guc_update_engine_gt_clks(engine);
> +	}
> +}
> +
> +static void guc_timestamp_ping(struct work_struct *wrk)
> +{
> +	struct intel_guc *guc = container_of(wrk, typeof(*guc),
> +					     timestamp.work.work);
> +	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +	unsigned long flags;
> +	int srcu, ret;
> +
> +	/*
> +	 * Synchronize with gt reset to make sure the worker does not
> +	 * corrupt the engine/guc stats.
> +	 */
> +	ret = intel_gt_reset_trylock(gt, &srcu);
> +	if (ret)
> +		return;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
> +		__update_guc_busyness_stats(guc);
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +
> +	intel_gt_reset_unlock(gt, srcu);
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +}
> +
> +static int guc_action_enable_usage_stats(struct intel_guc *guc)
> +{
> +	u32 offset = intel_guc_engine_usage_offset(guc);
> +	u32 action[] = {
> +		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
> +		offset,
> +		0,
> +	};
> +
> +	return intel_guc_send(guc, action, ARRAY_SIZE(action));
> +}
> +
> +static void guc_init_engine_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
> +		int ret = guc_action_enable_usage_stats(guc);
> +
> +		if (ret)
> +			drm_err(&gt->i915->drm,
> +				"Failed to enable usage stats: %d!\n", ret);
> +	}
> +}
> +
> +void intel_guc_busyness_park(struct intel_gt *gt)
> +{
> +	struct intel_guc *guc = &gt->uc.guc;
> +	unsigned long flags;
> +
> +	if (!guc_submission_initialized(guc))
> +		return;
> +
> +	cancel_delayed_work(&guc->timestamp.work);
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +	__update_guc_busyness_stats(guc);
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +void intel_guc_busyness_unpark(struct intel_gt *gt)
> +{
> +	struct intel_guc *guc = &gt->uc.guc;
> +
> +	if (!guc_submission_initialized(guc))
> +		return;
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +}
> +
>  static inline bool
>  submission_disabled(struct intel_guc *guc)
>  {
> @@ -1138,6 +1406,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
>  	intel_gt_park_heartbeats(guc_to_gt(guc));
>  	disable_submission(guc);
>  	guc->interrupts.disable(guc);
> +	__reset_guc_busyness_stats(guc);
>  
>  	/* Flush IRQ handler */
>  	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
> @@ -1484,6 +1753,7 @@ static void destroyed_worker_func(struct work_struct *w);
>   */
>  int intel_guc_submission_init(struct intel_guc *guc)
>  {
> +	struct intel_gt *gt = guc_to_gt(guc);
>  	int ret;
>  
>  	if (guc->lrc_desc_pool)
> @@ -1512,6 +1782,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
>  	if (!guc->submission_state.guc_ids_bitmap)
>  		return -ENOMEM;
>  
> +	spin_lock_init(&guc->timestamp.lock);
> +	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
> +	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
> +
>  	return 0;
>  }
>  
> @@ -3369,7 +3643,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>  		engine->emit_flush = gen12_emit_flush_xcs;
>  	}
>  	engine->set_default_submission = guc_set_default_submission;
> +	engine->busyness = guc_engine_busyness;
>  
> +	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>  	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>  	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
>  
> @@ -3468,6 +3744,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
>  void intel_guc_submission_enable(struct intel_guc *guc)
>  {
>  	guc_init_lrc_mapping(guc);
> +	guc_init_engine_stats(guc);
>  }
>  
>  void intel_guc_submission_disable(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index c7ef44fa0c36..5a95a9f0a8e3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>  void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
>  				    struct i915_request *hung_rq,
>  				    struct drm_printer *m);
> +void intel_guc_busyness_park(struct intel_gt *gt);
> +void intel_guc_busyness_unpark(struct intel_gt *gt);
>  
>  bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>  
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index d9f7a729333f..f7927f6dac6e 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2662,6 +2662,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>  #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
>  #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
>  
> +#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
> +
>  /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
>  #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
>  #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-27  0:48 [PATCH 1/2] " Umesh Nerlige Ramappa
@ 2021-10-27  0:48 ` Umesh Nerlige Ramappa
  2021-10-27 20:02   ` Matthew Brost
  2022-10-21  8:42   ` Tvrtko Ursulin
  0 siblings, 2 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-27  0:48 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: john.c.harrison, Tvrtko Ursulin, daniel.vetter, Matthew Brost

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

	engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically with
a period of 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once every 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them
(i.e. kmd may read the updated total and the stale start). In such a
case, the user may see a higher busyness value followed by smaller ones,
which would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
  called during i915 load. This ends up calling the GuC busyness unpark
  hook and results in kick-starting an uninitialized worker. Let
  park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
  of that.

v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
  domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
  longer and falls within the busyness tolerances in selftest.

v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
  trivial to fix since the PMU callback runs in irq context and the
  reset paths are tightly knit into the driver. The test that uncovers
  this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
  instead use the reset_count to synchronize with gt reset during pmu
  callback. For the ping, continue to use intel_reset_trylock since ping
  is not run in irq context.

- GuC PM timestamp does not tick when GuC is idle. This can potentially
  result in wrong busyness values when a context is active on the
  engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
  process the GuC busyness stats. This works since both GuC timestamp and
  RING timestamp are synced with the same clock.

- The busyness stats may get updated after the batch starts running.
  This delay causes the busyness reported for 100us duration to fall
  below 95% in the selftest. The only option at this time is to wait for
  GuC busyness to change from idle to active before we sample busyness
  over a 100us period.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
 drivers/gpu/drm/i915/gt/selftest_engine_pm.c  |  33 +++
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 277 ++++++++++++++++++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h               |   2 +
 13 files changed, 453 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 2de396e34d83..332756036007 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1915,23 +1915,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	intel_engine_print_breadcrumbs(engine, m);
 }
 
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-					    ktime_t *now)
-{
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	ktime_t total = stats->total;
-
-	/*
-	 * If the engine is executing something at the moment
-	 * add it to the total.
-	 */
-	*now = ktime_get();
-	if (READ_ONCE(stats->active))
-		total = ktime_add(total, ktime_sub(*now, stats->start));
-
-	return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine busyness
  * @engine: engine to report on
@@ -1941,16 +1924,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
 {
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	unsigned int seq;
-	ktime_t total;
-
-	do {
-		seq = read_seqcount_begin(&stats->lock);
-		total = __intel_engine_get_busy_time(engine, now);
-	} while (read_seqcount_retry(&stats->lock, seq));
-
-	return total;
+	return engine->busyness(engine, now);
 }
 
 struct intel_context *
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 24fa7fb0e7de..5732e0d71513 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
 	ktime_t start;
 };
 
+struct intel_engine_guc_stats {
+	/**
+	 * @running: Active state of the engine when busyness was last sampled.
+	 */
+	bool running;
+
+	/**
+	 * @prev_total: Previous value of total runtime clock cycles.
+	 */
+	u32 prev_total;
+
+	/**
+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
+	 */
+	u64 total_gt_clks;
+
+	/**
+	 * @start_gt_clk: GT clock time of last idle to active transition.
+	 */
+	u64 start_gt_clk;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -466,6 +488,12 @@ struct intel_engine_cs {
 	void		(*add_active_request)(struct i915_request *rq);
 	void		(*remove_active_request)(struct i915_request *rq);
 
+	/*
+	 * Get engine busyness and the time at which the busyness was sampled.
+	 */
+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
+				    ktime_t *now);
+
 	struct intel_engine_execlists execlists;
 
 	/*
@@ -515,7 +543,10 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 
 	struct {
-		struct intel_engine_execlists_stats execlists;
+		union {
+			struct intel_engine_execlists_stats execlists;
+			struct intel_engine_guc_stats guc;
+		};
 
 		/**
 		 * @rps: Utilisation at last RPS sampling.
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index bedb80057046..ca03880fa7e4 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3293,6 +3293,38 @@ static void execlists_release(struct intel_engine_cs *engine)
 	lrc_fini_wa_ctx(engine);
 }
 
+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
+					   ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	ktime_t total = stats->total;
+
+	/*
+	 * If the engine is executing something at the moment
+	 * add it to the total.
+	 */
+	*now = ktime_get();
+	if (READ_ONCE(stats->active))
+		total = ktime_add(total, ktime_sub(*now, stats->start));
+
+	return total;
+}
+
+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
+					 ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	unsigned int seq;
+	ktime_t total;
+
+	do {
+		seq = read_seqcount_begin(&stats->lock);
+		total = __execlists_engine_busyness(engine, now);
+	} while (read_seqcount_retry(&stats->lock, seq));
+
+	return total;
+}
+
 static void
 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
@@ -3349,6 +3381,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_bb_start = gen8_emit_bb_start;
 	else
 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
+
+	engine->busyness = execlists_engine_busyness;
 }
 
 static void logical_ring_default_irqs(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 524eaf678790..b4a8594bc46c 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 	intel_rc6_unpark(&gt->rc6);
 	intel_rps_unpark(&gt->rps);
 	i915_pmu_gt_unparked(i915);
+	intel_guc_busyness_unpark(gt);
 
 	intel_gt_unpark_requests(gt);
 	runtime_begin(gt);
@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
 	runtime_end(gt);
 	intel_gt_park_requests(gt);
 
+	intel_guc_busyness_park(gt);
 	i915_vma_parked(gt);
 	i915_pmu_gt_parked(i915);
 	intel_rps_park(&gt->rps);
diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
index 75569666105d..0bfd738dbf3a 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -214,6 +214,31 @@ static int live_engine_timestamps(void *arg)
 	return 0;
 }
 
+static int __spin_until_busier(struct intel_engine_cs *engine, ktime_t busyness)
+{
+	ktime_t start, unused, dt;
+
+	if (!intel_engine_uses_guc(engine))
+		return 0;
+
+	/*
+	 * In GuC mode of submission, the busyness stats may get updated after
+	 * the batch starts running. Poll for a change in busyness and timeout
+	 * after 500 us.
+	 */
+	start = ktime_get();
+	while (intel_engine_get_busy_time(engine, &unused) == busyness) {
+		dt = ktime_get() - start;
+		if (dt > 500000) {
+			pr_err("active wait timed out %lld\n", dt);
+			ENGINE_TRACE(engine, "active wait time out %lld\n", dt);
+			return -ETIME;
+		}
+	}
+
+	return 0;
+}
+
 static int live_engine_busy_stats(void *arg)
 {
 	struct intel_gt *gt = arg;
@@ -232,6 +257,7 @@ static int live_engine_busy_stats(void *arg)
 	GEM_BUG_ON(intel_gt_pm_is_awake(gt));
 	for_each_engine(engine, gt, id) {
 		struct i915_request *rq;
+		ktime_t busyness, dummy;
 		ktime_t de, dt;
 		ktime_t t[2];
 
@@ -274,12 +300,19 @@ static int live_engine_busy_stats(void *arg)
 		}
 		i915_request_add(rq);
 
+		busyness = intel_engine_get_busy_time(engine, &dummy);
 		if (!igt_wait_for_spinner(&spin, rq)) {
 			intel_gt_set_wedged(engine->gt);
 			err = -ETIME;
 			goto end;
 		}
 
+		err = __spin_until_busier(engine, busyness);
+		if (err) {
+			GEM_TRACE_DUMP();
+			goto end;
+		}
+
 		ENGINE_TRACE(engine, "measuring busy time\n");
 		preempt_disable();
 		de = intel_engine_get_busy_time(engine, &t[0]);
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index ba10bd374cee..fe5d7d261797 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -144,6 +144,7 @@ enum intel_guc_action {
 	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
 	INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
 	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
 	INTEL_GUC_ACTION_LIMIT
 };
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 31cf9fb48c7e..1cb46098030d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -138,6 +138,8 @@ struct intel_guc {
 	u32 ads_regset_size;
 	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
 	u32 ads_golden_ctxt_size;
+	/** @ads_engine_usage_size: size of engine usage in the ADS */
+	u32 ads_engine_usage_size;
 
 	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
 	struct i915_vma *lrc_desc_pool;
@@ -172,6 +174,34 @@ struct intel_guc {
 
 	/** @send_mutex: used to serialize the intel_guc_send actions */
 	struct mutex send_mutex;
+
+	/**
+	 * @timestamp: GT timestamp object that stores a copy of the timestamp
+	 * and adjusts it for overflow using a worker.
+	 */
+	struct {
+		/**
+		 * @lock: Lock protecting the below fields and the engine stats.
+		 */
+		spinlock_t lock;
+
+		/**
+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
+		 */
+		u64 gt_stamp;
+
+		/**
+		 * @ping_delay: Period for polling the GT timestamp for
+		 * overflow.
+		 */
+		unsigned long ping_delay;
+
+		/**
+		 * @work: Periodic work to adjust GT timestamp, engine and
+		 * context usage for overflows.
+		 */
+		struct delayed_work work;
+	} timestamp;
 };
 
 static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 621c893a009f..1a1edae67e4e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -26,6 +26,8 @@
  *      | guc_policies                          |
  *      +---------------------------------------+
  *      | guc_gt_system_info                    |
+ *      +---------------------------------------+
+ *      | guc_engine_usage                      |
  *      +---------------------------------------+ <== static
  *      | guc_mmio_reg[countA] (engine 0.0)     |
  *      | guc_mmio_reg[countB] (engine 0.1)     |
@@ -47,6 +49,7 @@ struct __guc_ads_blob {
 	struct guc_ads ads;
 	struct guc_policies policies;
 	struct guc_gt_system_info system_info;
+	struct guc_engine_usage engine_usage;
 	/* From here on, location is dynamic! Refer to above diagram. */
 	struct guc_mmio_reg regset[0];
 } __packed;
@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
 
 	guc_ads_private_data_reset(guc);
 }
+
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
+{
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
+	u32 offset = base + ptr_offset(blob, engine_usage);
+
+	return offset;
+}
+
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
+{
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u8 guc_class = engine_class_to_guc_class(engine->class);
+
+	return &blob->engine_usage.engines[guc_class][ilog2(engine->logical_mask)];
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
index 3d85051d57e4..e74c110facff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
@@ -6,8 +6,11 @@
 #ifndef _INTEL_GUC_ADS_H_
 #define _INTEL_GUC_ADS_H_
 
+#include <linux/types.h>
+
 struct intel_guc;
 struct drm_printer;
+struct intel_engine_cs;
 
 int intel_guc_ads_create(struct intel_guc *guc);
 void intel_guc_ads_destroy(struct intel_guc *guc);
@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
 void intel_guc_ads_reset(struct intel_guc *guc);
 void intel_guc_ads_print_policy_info(struct intel_guc *guc,
 				     struct drm_printer *p);
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
 
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index 722933e26347..7072e30e99f4 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -294,6 +294,19 @@ struct guc_ads {
 	u32 reserved[15];
 } __packed;
 
+/* Engine usage stats */
+struct guc_engine_usage_record {
+	u32 current_context_index;
+	u32 last_switch_in_stamp;
+	u32 reserved0;
+	u32 total_runtime;
+	u32 reserved1[4];
+} __packed;
+
+struct guc_engine_usage {
+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
+} __packed;
+
 /* GuC logging structures */
 
 enum guc_log_buffer_type {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 38b47e73e35d..5cc49c0b3889 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -13,6 +13,7 @@
 #include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_gpu_commands.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_clock_utils.h"
 #include "gt/intel_gt_irq.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_requests.h"
@@ -21,6 +22,7 @@
 #include "gt/intel_mocs.h"
 #include "gt/intel_ring.h"
 
+#include "intel_guc_ads.h"
 #include "intel_guc_submission.h"
 
 #include "i915_drv.h"
@@ -1077,6 +1079,272 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 	xa_unlock_irqrestore(&guc->context_lookup, flags);
 }
 
+/*
+ * GuC stores busyness stats for each engine at context in/out boundaries. A
+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
+ * GuC.
+ *
+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
+ * active. For an active engine total busyness = total + (now - start), where
+ * 'now' is the time at which the busyness is sampled. For inactive engine,
+ * total busyness = total.
+ *
+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
+ *
+ * The start and total values provided by GuC are 32 bits and wrap around in a
+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
+ * increasing ns values, there is a need for this implementation to account for
+ * overflows and extend the GuC provided values to 64 bits before returning
+ * busyness to the user. In order to do that, a worker runs periodically at
+ * frequency = 1/8th the time it takes for the timestamp to wrap (i.e. once in
+ * 27 seconds for a gt clock frequency of 19.2 MHz).
+ */
+
+#define WRAP_TIME_CLKS U32_MAX
+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
+
+static void
+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
+{
+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
+
+	if (new_start == lower_32_bits(*prev_start))
+		return;
+
+	if (new_start < gt_stamp_last &&
+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
+		gt_stamp_hi++;
+
+	if (new_start > gt_stamp_last &&
+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
+		gt_stamp_hi--;
+
+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
+}
+
+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
+{
+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	u32 last_switch = rec->last_switch_in_stamp;
+	u32 ctx_id = rec->current_context_index;
+	u32 total = rec->total_runtime;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	stats->running = ctx_id != ~0U && last_switch;
+	if (stats->running)
+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
+
+	/*
+	 * Instead of adjusting the total for overflow, just add the
+	 * difference from previous sample stats->total_gt_clks
+	 */
+	if (total && total != ~0U) {
+		stats->total_gt_clks += (u32)(total - stats->prev_total);
+		stats->prev_total = total;
+	}
+}
+
+static void guc_update_pm_timestamp(struct intel_guc *guc,
+				    struct intel_engine_cs *engine,
+				    ktime_t *now)
+{
+	u32 gt_stamp_now, gt_stamp_hi;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	gt_stamp_now = intel_uncore_read(engine->uncore,
+					 RING_TIMESTAMP(engine->mmio_base));
+	*now = ktime_get();
+
+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
+		gt_stamp_hi++;
+
+	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
+}
+
+/*
+ * Unlike the execlist mode of submission total and active times are in terms of
+ * gt clocks. The *now parameter is retained to return the cpu time at which the
+ * busyness was sampled.
+ */
+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
+{
+	struct intel_engine_guc_stats stats_saved, *stats = &engine->stats.guc;
+	struct i915_gpu_error *gpu_error = &engine->i915->gpu_error;
+	struct intel_gt *gt = engine->gt;
+	struct intel_guc *guc = &gt->uc.guc;
+	u64 total, gt_stamp_saved;
+	unsigned long flags;
+	u32 reset_count;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	/*
+	 * If a reset happened, we risk reading partially updated
+	 * engine busyness from GuC, so we just use the driver stored
+	 * copy of busyness. Synchronize with gt reset using reset_count.
+	 */
+	reset_count = i915_reset_count(gpu_error);
+
+	*now = ktime_get();
+
+	/*
+	 * The active busyness depends on start_gt_clk and gt_stamp.
+	 * gt_stamp is updated by i915 only when gt is awake and the
+	 * start_gt_clk is derived from GuC state. To get a consistent
+	 * view of activity, we query the GuC state only if gt is awake.
+	 */
+	stats_saved = *stats;
+	gt_stamp_saved = guc->timestamp.gt_stamp;
+	if (intel_gt_pm_get_if_awake(gt)) {
+		guc_update_engine_gt_clks(engine);
+		guc_update_pm_timestamp(guc, engine, now);
+		intel_gt_pm_put_async(gt);
+		if (i915_reset_count(gpu_error) != reset_count) {
+			*stats = stats_saved;
+			guc->timestamp.gt_stamp = gt_stamp_saved;
+		}
+	}
+
+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
+	if (stats->running) {
+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
+
+		total += intel_gt_clock_interval_to_ns(gt, clk);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+
+	return ns_to_ktime(total);
+}
+
+static void __reset_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long flags;
+	ktime_t unused;
+
+	cancel_delayed_work_sync(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	for_each_engine(engine, gt, id) {
+		guc_update_pm_timestamp(guc, engine, &unused);
+		guc_update_engine_gt_clks(engine);
+		engine->stats.guc.prev_total = 0;
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void __update_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	ktime_t unused;
+
+	for_each_engine(engine, gt, id) {
+		guc_update_pm_timestamp(guc, engine, &unused);
+		guc_update_engine_gt_clks(engine);
+	}
+}
+
+static void guc_timestamp_ping(struct work_struct *wrk)
+{
+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
+					     timestamp.work.work);
+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+	unsigned long flags;
+	int srcu, ret;
+
+	/*
+	 * Synchronize with gt reset to make sure the worker does not
+	 * corrupt the engine/guc stats.
+	 */
+	ret = intel_gt_reset_trylock(gt, &srcu);
+	if (ret)
+		return;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
+		__update_guc_busyness_stats(guc);
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+
+	intel_gt_reset_unlock(gt, srcu);
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
+static int guc_action_enable_usage_stats(struct intel_guc *guc)
+{
+	u32 offset = intel_guc_engine_usage_offset(guc);
+	u32 action[] = {
+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
+		offset,
+		0,
+	};
+
+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
+}
+
+static void guc_init_engine_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
+		int ret = guc_action_enable_usage_stats(guc);
+
+		if (ret)
+			drm_err(&gt->i915->drm,
+				"Failed to enable usage stats: %d!\n", ret);
+	}
+}
+
+void intel_guc_busyness_park(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+
+	if (!guc_submission_initialized(guc))
+		return;
+
+	cancel_delayed_work(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+	__update_guc_busyness_stats(guc);
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+void intel_guc_busyness_unpark(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+
+	if (!guc_submission_initialized(guc))
+		return;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -1138,6 +1406,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
 	intel_gt_park_heartbeats(guc_to_gt(guc));
 	disable_submission(guc);
 	guc->interrupts.disable(guc);
+	__reset_guc_busyness_stats(guc);
 
 	/* Flush IRQ handler */
 	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
@@ -1484,6 +1753,7 @@ static void destroyed_worker_func(struct work_struct *w);
  */
 int intel_guc_submission_init(struct intel_guc *guc)
 {
+	struct intel_gt *gt = guc_to_gt(guc);
 	int ret;
 
 	if (guc->lrc_desc_pool)
@@ -1512,6 +1782,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	if (!guc->submission_state.guc_ids_bitmap)
 		return -ENOMEM;
 
+	spin_lock_init(&guc->timestamp.lock);
+	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
+	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
+
 	return 0;
 }
 
@@ -3369,7 +3643,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_flush = gen12_emit_flush_xcs;
 	}
 	engine->set_default_submission = guc_set_default_submission;
+	engine->busyness = guc_engine_busyness;
 
+	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
 	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
 	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
 
@@ -3468,6 +3744,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
 void intel_guc_submission_enable(struct intel_guc *guc)
 {
 	guc_init_lrc_mapping(guc);
+	guc_init_engine_stats(guc);
 }
 
 void intel_guc_submission_disable(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c7ef44fa0c36..5a95a9f0a8e3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
 void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
 				    struct i915_request *hung_rq,
 				    struct drm_printer *m);
+void intel_guc_busyness_park(struct intel_gt *gt);
+void intel_guc_busyness_unpark(struct intel_gt *gt);
 
 bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index d9f7a729333f..f7927f6dac6e 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2662,6 +2662,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
 #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
 
+#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
+
 /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
 #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
 #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-15  1:18 [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats Umesh Nerlige Ramappa
@ 2021-10-15  1:18 ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-15  1:18 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: john.c.harrison, Tvrtko Ursulin, daniel.vetter, Matthew Brost

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

	engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically with
a period of 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e. kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
  called during i915 load. This ends up calling the guc busyness unpark
  hook and results in kickstarting an uninitialized worker. Let
  park/unpark hooks check if guc submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
  of that.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c         |  15 +
 drivers/gpu/drm/i915/gt/intel_reset.h         |   1 +
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 273 ++++++++++++++++++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h               |   2 +
 14 files changed, 432 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	intel_engine_print_breadcrumbs(engine, m);
 }
 
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-					    ktime_t *now)
-{
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	ktime_t total = stats->total;
-
-	/*
-	 * If the engine is executing something at the moment
-	 * add it to the total.
-	 */
-	*now = ktime_get();
-	if (READ_ONCE(stats->active))
-		total = ktime_add(total, ktime_sub(*now, stats->start));
-
-	return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine busyness
  * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
 {
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	unsigned int seq;
-	ktime_t total;
-
-	do {
-		seq = read_seqcount_begin(&stats->lock);
-		total = __intel_engine_get_busy_time(engine, now);
-	} while (read_seqcount_retry(&stats->lock, seq));
-
-	return total;
+	return engine->busyness(engine, now);
 }
 
 struct intel_context *
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index b820a2c1124e..9300c65d6675 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
 	ktime_t start;
 };
 
+struct intel_engine_guc_stats {
+	/**
+	 * @running: Active state of the engine when busyness was last sampled.
+	 */
+	bool running;
+
+	/**
+	 * @prev_total: Previous value of total runtime clock cycles.
+	 */
+	u32 prev_total;
+
+	/**
+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
+	 */
+	u64 total_gt_clks;
+
+	/**
+	 * @start_gt_clk: GT clock time of last idle to active transition.
+	 */
+	u64 start_gt_clk;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -459,6 +481,12 @@ struct intel_engine_cs {
 	void		(*add_active_request)(struct i915_request *rq);
 	void		(*remove_active_request)(struct i915_request *rq);
 
+	/*
+	 * Get engine busyness and the time at which the busyness was sampled.
+	 */
+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
+				    ktime_t *now);
+
 	struct intel_engine_execlists execlists;
 
 	/*
@@ -508,7 +536,10 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 
 	struct {
-		struct intel_engine_execlists_stats execlists;
+		union {
+			struct intel_engine_execlists_stats execlists;
+			struct intel_engine_guc_stats guc;
+		};
 
 		/**
 		 * @rps: Utilisation at last RPS sampling.
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 73a79c2acd3a..e8ffcf36f6f4 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
 	lrc_fini_wa_ctx(engine);
 }
 
+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
+					   ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	ktime_t total = stats->total;
+
+	/*
+	 * If the engine is executing something at the moment
+	 * add it to the total.
+	 */
+	*now = ktime_get();
+	if (READ_ONCE(stats->active))
+		total = ktime_add(total, ktime_sub(*now, stats->start));
+
+	return total;
+}
+
+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
+					 ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	unsigned int seq;
+	ktime_t total;
+
+	do {
+		seq = read_seqcount_begin(&stats->lock);
+		total = __execlists_engine_busyness(engine, now);
+	} while (read_seqcount_retry(&stats->lock, seq));
+
+	return total;
+}
+
 static void
 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
@@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_bb_start = gen8_emit_bb_start;
 	else
 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
+
+	engine->busyness = execlists_engine_busyness;
 }
 
 static void logical_ring_default_irqs(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 524eaf678790..b4a8594bc46c 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 	intel_rc6_unpark(&gt->rc6);
 	intel_rps_unpark(&gt->rps);
 	i915_pmu_gt_unparked(i915);
+	intel_guc_busyness_unpark(gt);
 
 	intel_gt_unpark_requests(gt);
 	runtime_begin(gt);
@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
 	runtime_end(gt);
 	intel_gt_park_requests(gt);
 
+	intel_guc_busyness_park(gt);
 	i915_vma_parked(gt);
 	i915_pmu_gt_parked(i915);
 	intel_rps_park(&gt->rps);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 91200c43951f..37b4e6b852a6 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1389,6 +1389,21 @@ void intel_gt_handle_error(struct intel_gt *gt,
 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
 }
 
+bool intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu)
+{
+	int reset_in_progress;
+
+	might_lock(&gt->reset.backoff_srcu);
+
+	rcu_read_lock();
+	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
+	if (!reset_in_progress)
+		*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
+	rcu_read_unlock();
+
+	return reset_in_progress;
+}
+
 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
 {
 	might_lock(&gt->reset.backoff_srcu);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
index adc734e67387..4f5f4c00c54f 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.h
+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
@@ -38,6 +38,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine,
 
 void __i915_request_reset(struct i915_request *rq, bool guilty);
 
+bool __must_check intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu);
 int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
 void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
 
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index 8ff582222aff..ff1311d4beff 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -143,6 +143,7 @@ enum intel_guc_action {
 	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
 	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
 	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
 	INTEL_GUC_ACTION_LIMIT
 };
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 5dd174babf7a..3c3d48c7d5de 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -104,6 +104,8 @@ struct intel_guc {
 	u32 ads_regset_size;
 	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
 	u32 ads_golden_ctxt_size;
+	/** @ads_engine_usage_size: size of engine usage in the ADS */
+	u32 ads_engine_usage_size;
 
 	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
 	struct i915_vma *lrc_desc_pool;
@@ -138,6 +140,34 @@ struct intel_guc {
 
 	/** @send_mutex: used to serialize the intel_guc_send actions */
 	struct mutex send_mutex;
+
+	/**
+	 * @timestamp: GT timestamp object that stores a copy of the timestamp
+	 * and adjusts it for overflow using a worker.
+	 */
+	struct {
+		/**
+		 * @lock: Lock protecting the below fields and the engine stats.
+		 */
+		spinlock_t lock;
+
+		/**
+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
+		 */
+		u64 gt_stamp;
+
+		/**
+		 * @ping_delay: Period for polling the GT timestamp for
+		 * overflow.
+		 */
+		unsigned long ping_delay;
+
+		/**
+		 * @work: Periodic work to adjust GT timestamp, engine and
+		 * context usage for overflows.
+		 */
+		struct delayed_work work;
+	} timestamp;
 };
 
 static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 2c6ea64af7ec..ca9ab53999d5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -26,6 +26,8 @@
  *      | guc_policies                          |
  *      +---------------------------------------+
  *      | guc_gt_system_info                    |
+ *      +---------------------------------------+
+ *      | guc_engine_usage                      |
  *      +---------------------------------------+ <== static
  *      | guc_mmio_reg[countA] (engine 0.0)     |
  *      | guc_mmio_reg[countB] (engine 0.1)     |
@@ -47,6 +49,7 @@ struct __guc_ads_blob {
 	struct guc_ads ads;
 	struct guc_policies policies;
 	struct guc_gt_system_info system_info;
+	struct guc_engine_usage engine_usage;
 	/* From here on, location is dynamic! Refer to above diagram. */
 	struct guc_mmio_reg regset[0];
 } __packed;
@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
 
 	guc_ads_private_data_reset(guc);
 }
+
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
+{
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
+	u32 offset = base + ptr_offset(blob, engine_usage);
+
+	return offset;
+}
+
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
+{
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u8 guc_class = engine_class_to_guc_class(engine->class);
+
+	return &blob->engine_usage.engines[guc_class][engine->instance];
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
index 3d85051d57e4..e74c110facff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
@@ -6,8 +6,11 @@
 #ifndef _INTEL_GUC_ADS_H_
 #define _INTEL_GUC_ADS_H_
 
+#include <linux/types.h>
+
 struct intel_guc;
 struct drm_printer;
+struct intel_engine_cs;
 
 int intel_guc_ads_create(struct intel_guc *guc);
 void intel_guc_ads_destroy(struct intel_guc *guc);
@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
 void intel_guc_ads_reset(struct intel_guc *guc);
 void intel_guc_ads_print_policy_info(struct intel_guc *guc,
 				     struct drm_printer *p);
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
 
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index fa4be13c8854..7c9c081670fc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -294,6 +294,19 @@ struct guc_ads {
 	u32 reserved[15];
 } __packed;
 
+/* Engine usage stats */
+struct guc_engine_usage_record {
+	u32 current_context_index;
+	u32 last_switch_in_stamp;
+	u32 reserved0;
+	u32 total_runtime;
+	u32 reserved1[4];
+} __packed;
+
+struct guc_engine_usage {
+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
+} __packed;
+
 /* GuC logging structures */
 
 enum guc_log_buffer_type {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index ba0de35f6323..f0d09feff14e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -12,6 +12,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_clock_utils.h"
 #include "gt/intel_gt_irq.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_requests.h"
@@ -20,6 +21,7 @@
 #include "gt/intel_mocs.h"
 #include "gt/intel_ring.h"
 
+#include "intel_guc_ads.h"
 #include "intel_guc_submission.h"
 
 #include "i915_drv.h"
@@ -750,6 +752,268 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 	xa_unlock_irqrestore(&guc->context_lookup, flags);
 }
 
+/*
+ * GuC stores busyness stats for each engine at context in/out boundaries. A
+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
+ * GuC.
+ *
+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
+ * active. For an active engine total busyness = total + (now - start), where
+ * 'now' is the time at which the busyness is sampled. For inactive engine,
+ * total busyness = total.
+ *
+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
+ *
+ * The start and total values provided by GuC are 32 bits and wrap around in a
+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
+ * increasing ns values, there is a need for this implementation to account for
+ * overflows and extend the GuC provided values to 64 bits before returning
+ * busyness to the user. In order to do that, a worker runs periodically with a
+ * period of 1/8th the time it takes for the timestamp to wrap (i.e. once every
+ * 27 seconds for a gt clock frequency of 19.2 MHz).
+ */
+
+#define WRAP_TIME_CLKS U32_MAX
+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
+
+static void
+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
+{
+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
+
+	if (new_start == lower_32_bits(*prev_start))
+		return;
+
+	if (new_start < gt_stamp_last &&
+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
+		gt_stamp_hi++;
+
+	if (new_start > gt_stamp_last &&
+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
+		gt_stamp_hi--;
+
+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
+}
+
+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
+{
+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	u32 last_switch = rec->last_switch_in_stamp;
+	u32 ctx_id = rec->current_context_index;
+	u32 total = rec->total_runtime;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	stats->running = ctx_id != ~0U && last_switch;
+	if (stats->running)
+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
+
+	/*
+	 * Instead of adjusting the total for overflow, just add the
+	 * difference from the previous sample to stats->total_gt_clks.
+	 */
+	if (total && total != ~0U) {
+		stats->total_gt_clks += (u32)(total - stats->prev_total);
+		stats->prev_total = total;
+	}
+}
+
+static void guc_update_pm_timestamp(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	u32 gt_stamp_now, gt_stamp_hi;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+
+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
+		gt_stamp_hi++;
+
+	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
+}
+
+/*
+ * Unlike the execlist mode of submission, total and active times are in terms
+ * of gt clocks. The *now parameter is retained to return the cpu time at which
+ * the busyness was sampled.
+ */
+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
+{
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_gt *gt = engine->gt;
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+	bool reset_in_progress;
+	u64 total;
+	int srcu;
+
+	/*
+	 * If a reset is in progress, we risk reading partially updated
+	 * engine busyness from GuC, so we just use the driver stored
+	 * copy of busyness. Synchronize with gt reset lock to achieve
+	 * this.
+	 */
+	reset_in_progress = intel_gt_reset_trylock_no_wait(gt, &srcu);
+
+	/*
+	 * The order of taking the reset lock first and then the
+	 * timestamp lock is intentional to avoid lock inversion related
+	 * issues.
+	 */
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	*now = ktime_get();
+
+	/*
+	 * The active busyness depends on start_gt_clk and gt_stamp.
+	 * gt_stamp is updated by i915 only when gt is awake and the
+	 * start_gt_clk is derived from GuC state. To get a consistent
+	 * view of activity, we query the GuC state only if gt is awake.
+	 */
+	if (intel_gt_pm_get_if_awake(gt) && !reset_in_progress) {
+		guc_update_engine_gt_clks(engine);
+		guc_update_pm_timestamp(guc);
+		intel_gt_pm_put_async(gt);
+	}
+
+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
+	if (stats->running) {
+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
+
+		total += intel_gt_clock_interval_to_ns(gt, clk);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+	if (!reset_in_progress)
+		intel_gt_reset_unlock(gt, srcu);
+
+	return ns_to_ktime(total);
+}
+
+static void __reset_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long flags;
+
+	cancel_delayed_work_sync(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id) {
+		guc_update_engine_gt_clks(engine);
+		engine->stats.guc.prev_total = 0;
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void __update_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id)
+		guc_update_engine_gt_clks(engine);
+}
+
+static void guc_timestamp_ping(struct work_struct *wrk)
+{
+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
+					     timestamp.work.work);
+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+	unsigned long flags;
+	int srcu, ret;
+
+	/*
+	 * Synchronize with gt reset to make sure the worker does not
+	 * corrupt the engine/guc stats.
+	 */
+	ret = intel_gt_reset_trylock(gt, &srcu);
+	if (ret)
+		return;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
+		__update_guc_busyness_stats(guc);
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+
+	intel_gt_reset_unlock(gt, srcu);
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
+static int guc_action_enable_usage_stats(struct intel_guc *guc)
+{
+	u32 offset = intel_guc_engine_usage_offset(guc);
+	u32 action[] = {
+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
+		offset,
+		0,
+	};
+
+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
+}
+
+static void guc_init_engine_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
+		int ret = guc_action_enable_usage_stats(guc);
+
+		if (ret)
+			drm_err(&gt->i915->drm,
+				"Failed to enable usage stats: %d!\n", ret);
+	}
+}
+
+void intel_guc_busyness_park(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+
+	if (!guc_submission_initialized(guc))
+		return;
+
+	cancel_delayed_work(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+	__update_guc_busyness_stats(guc);
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+void intel_guc_busyness_unpark(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+
+	if (!guc_submission_initialized(guc))
+		return;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -809,6 +1073,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
 	intel_gt_park_heartbeats(guc_to_gt(guc));
 	disable_submission(guc);
 	guc->interrupts.disable(guc);
+	__reset_guc_busyness_stats(guc);
 
 	/* Flush IRQ handler */
 	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
@@ -1132,6 +1397,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
  */
 int intel_guc_submission_init(struct intel_guc *guc)
 {
+	struct intel_gt *gt = guc_to_gt(guc);
 	int ret;
 
 	if (guc->lrc_desc_pool)
@@ -1152,6 +1418,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	INIT_LIST_HEAD(&guc->guc_id_list);
 	ida_init(&guc->guc_ids);
 
+	spin_lock_init(&guc->timestamp.lock);
+	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
+	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
+
 	return 0;
 }
 
@@ -2606,7 +2876,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_flush = gen12_emit_flush_xcs;
 	}
 	engine->set_default_submission = guc_set_default_submission;
+	engine->busyness = guc_engine_busyness;
 
+	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
 	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
 	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
 
@@ -2705,6 +2977,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
 void intel_guc_submission_enable(struct intel_guc *guc)
 {
 	guc_init_lrc_mapping(guc);
+	guc_init_engine_stats(guc);
 }
 
 void intel_guc_submission_disable(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c7ef44fa0c36..5a95a9f0a8e3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
 void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
 				    struct i915_request *hung_rq,
 				    struct drm_printer *m);
+void intel_guc_busyness_park(struct intel_gt *gt);
+void intel_guc_busyness_unpark(struct intel_gt *gt);
 
 bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a897f4abea0c..9aee08425382 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
 #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
 
+#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
+
 /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
 #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
 #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-14  8:21   ` Tvrtko Ursulin
@ 2021-10-15  1:01     ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-15  1:01 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Thu, Oct 14, 2021 at 09:21:28AM +0100, Tvrtko Ursulin wrote:
>
>On 13/10/2021 01:56, Umesh Nerlige Ramappa wrote:
>>With GuC handling scheduling, i915 is not aware of the time that a
>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>this info to provide engine busyness to the user, GuC shares this info
>>with i915 for all engines using shared memory. For each engine, this
>>info contains:
>>
>>- total busyness: total time that the context was running (total)
>>- id: id of the running context (id)
>>- start timestamp: timestamp when the context started running (start)
>>
>>At the time (now) of sampling the engine busyness, if the id is valid
>>(!= ~0), and start is non-zero, then the context is considered to be
>>active and the engine busyness is calculated using the below equation
>>
>>	engine busyness = total + (now - start)
>>
>>All times are obtained from the gt clock base. For inactive contexts,
>>engine busyness is just equal to the total.
>>
>>The start and total values provided by GuC are 32 bits and wrap around
>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>monotonically increasing values, there is a need for this implementation
>>to account for overflows and extend the time to 64 bits before returning
>>busyness to the user. In order to do that, a worker runs periodically with
>>a period of 1/8th the time it takes for the timestamp to wrap. As an
>>example, that would be once every 27 seconds for a gt clock frequency of
>>19.2 MHz.
>>
>>Note:
>>There might be an overaccounting of busyness due to the fact that GuC
>>may be updating the total and start values while kmd is reading them
>>(i.e. kmd may read the updated total and the stale start). In such a
>>case, the user may see a higher busyness value followed by smaller ones
>>which would eventually catch up to the higher value.
>>
>>v2: (Tvrtko)
>>- Include details in commit message
>>- Move intel engine busyness function into execlist code
>>- Use union inside engine->stats
>>- Use natural type for ping delay jiffies
>>- Drop active_work condition checks
>>- Use for_each_engine if iterating all engines
>>- Drop seq locking, use spinlock at guc level to update engine stats
>>- Document worker specific details
>>
>>v3: (Tvrtko/Umesh)
>>- Demarcate guc and execlist stat objects with comments
>>- Document known over-accounting issue in commit
>>- Provide a consistent view of guc state
>>- Add hooks to gt park/unpark for guc busyness
>>- Stop/start worker in gt park/unpark path
>>- Drop inline
>>- Move spinlock and worker inits to guc initialization
>>- Drop helpers that are called only once
>>
>>v4: (Tvrtko/Matt/Umesh)
>>- Drop addressed opens from commit message
>>- Get runtime pm in ping, remove from the park path
>>- Use cancel_delayed_work_sync in disable_submission path
>>- Update stats during reset prepare
>>- Skip ping if reset in progress
>>- Explicitly name execlists and guc stats objects
>>- Since disable_submission is called from many places, move resetting
>>   stats to intel_guc_submission_reset_prepare
>>
>>v5: (Tvrtko)
>>- Add a trylock helper that does not sleep and synchronize PMU event
>>   callbacks and worker with gt reset
>>
>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>---
>>  drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
>>  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
>>  .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
>>  drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
>>  drivers/gpu/drm/i915/gt/intel_reset.c         |  16 ++
>>  drivers/gpu/drm/i915/gt/intel_reset.h         |   1 +
>>  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
>>  drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++++++++++++++++++
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
>>  drivers/gpu/drm/i915/i915_reg.h               |   2 +
>>  14 files changed, 427 insertions(+), 28 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>index 38436f4b5706..6b783fdcba2a 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>>  	intel_engine_print_breadcrumbs(engine, m);
>>  }
>>-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>>-					    ktime_t *now)
>>-{
>>-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>-	ktime_t total = stats->total;
>>-
>>-	/*
>>-	 * If the engine is executing something at the moment
>>-	 * add it to the total.
>>-	 */
>>-	*now = ktime_get();
>>-	if (READ_ONCE(stats->active))
>>-		total = ktime_add(total, ktime_sub(*now, stats->start));
>>-
>>-	return total;
>>-}
>>-
>>  /**
>>   * intel_engine_get_busy_time() - Return current accumulated engine busyness
>>   * @engine: engine to report on
>>@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>>   */
>>  ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
>>  {
>>-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>-	unsigned int seq;
>>-	ktime_t total;
>>-
>>-	do {
>>-		seq = read_seqcount_begin(&stats->lock);
>>-		total = __intel_engine_get_busy_time(engine, now);
>>-	} while (read_seqcount_retry(&stats->lock, seq));
>>-
>>-	return total;
>>+	return engine->busyness(engine, now);
>>  }
>>  struct intel_context *
>>diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>index b820a2c1124e..9300c65d6675 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
>>  	ktime_t start;
>>  };
>>+struct intel_engine_guc_stats {
>>+	/**
>>+	 * @running: Active state of the engine when busyness was last sampled.
>>+	 */
>>+	bool running;
>>+
>>+	/**
>>+	 * @prev_total: Previous value of total runtime clock cycles.
>>+	 */
>>+	u32 prev_total;
>>+
>>+	/**
>>+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
>>+	 */
>>+	u64 total_gt_clks;
>>+
>>+	/**
>>+	 * @start_gt_clk: GT clock time of last idle to active transition.
>>+	 */
>>+	u64 start_gt_clk;
>>+};
>>+
>>  struct intel_engine_cs {
>>  	struct drm_i915_private *i915;
>>  	struct intel_gt *gt;
>>@@ -459,6 +481,12 @@ struct intel_engine_cs {
>>  	void		(*add_active_request)(struct i915_request *rq);
>>  	void		(*remove_active_request)(struct i915_request *rq);
>>+	/*
>>+	 * Get engine busyness and the time at which the busyness was sampled.
>>+	 */
>>+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
>>+				    ktime_t *now);
>>+
>>  	struct intel_engine_execlists execlists;
>>  	/*
>>@@ -508,7 +536,10 @@ struct intel_engine_cs {
>>  	u32 (*get_cmd_length_mask)(u32 cmd_header);
>>  	struct {
>>-		struct intel_engine_execlists_stats execlists;
>>+		union {
>>+			struct intel_engine_execlists_stats execlists;
>>+			struct intel_engine_guc_stats guc;
>>+		};
>>  		/**
>>  		 * @rps: Utilisation at last RPS sampling.
>>diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>index 73a79c2acd3a..e8ffcf36f6f4 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>@@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
>>  	lrc_fini_wa_ctx(engine);
>>  }
>>+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
>>+					   ktime_t *now)
>>+{
>>+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>+	ktime_t total = stats->total;
>>+
>>+	/*
>>+	 * If the engine is executing something at the moment
>>+	 * add it to the total.
>>+	 */
>>+	*now = ktime_get();
>>+	if (READ_ONCE(stats->active))
>>+		total = ktime_add(total, ktime_sub(*now, stats->start));
>>+
>>+	return total;
>>+}
>>+
>>+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
>>+					 ktime_t *now)
>>+{
>>+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>+	unsigned int seq;
>>+	ktime_t total;
>>+
>>+	do {
>>+		seq = read_seqcount_begin(&stats->lock);
>>+		total = __execlists_engine_busyness(engine, now);
>>+	} while (read_seqcount_retry(&stats->lock, seq));
>>+
>>+	return total;
>>+}
>>+
>>  static void
>>  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>  {
>>@@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>  		engine->emit_bb_start = gen8_emit_bb_start;
>>  	else
>>  		engine->emit_bb_start = gen8_emit_bb_start_noarb;
>>+
>>+	engine->busyness = execlists_engine_busyness;
>>  }
>>  static void logical_ring_default_irqs(struct intel_engine_cs *engine)
>>diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>index 524eaf678790..b4a8594bc46c 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
>>  	intel_rc6_unpark(&gt->rc6);
>>  	intel_rps_unpark(&gt->rps);
>>  	i915_pmu_gt_unparked(i915);
>>+	intel_guc_busyness_unpark(gt);
>>  	intel_gt_unpark_requests(gt);
>>  	runtime_begin(gt);
>>@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
>>  	runtime_end(gt);
>>  	intel_gt_park_requests(gt);
>>+	intel_guc_busyness_park(gt);
>>  	i915_vma_parked(gt);
>>  	i915_pmu_gt_parked(i915);
>>  	intel_rps_park(&gt->rps);
>>diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
>>index 91200c43951f..ac12163c3639 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_reset.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
>>@@ -1389,6 +1389,22 @@ void intel_gt_handle_error(struct intel_gt *gt,
>>  	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
>>  }
>>+bool intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu)
>>+{
>>+	int reset_in_progress;
>>+
>>+	might_lock(&gt->reset.backoff_srcu);
>>+	cant_sleep();
>>+
>>+	rcu_read_lock();
>>+	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
>>+	if (!reset_in_progress)
>>+		*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
>>+	rcu_read_unlock();
>>+
>>+	return reset_in_progress;
>>+}
>>+
>>  int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
>>  {
>>  	might_lock(&gt->reset.backoff_srcu);
>>diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
>>index adc734e67387..4f5f4c00c54f 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_reset.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
>>@@ -38,6 +38,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine,
>>  void __i915_request_reset(struct i915_request *rq, bool guilty);
>>+bool __must_check intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu);
>>  int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
>>  void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
>>diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>index 8ff582222aff..ff1311d4beff 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>@@ -143,6 +143,7 @@ enum intel_guc_action {
>>  	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>>  	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>>  	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
>>+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>>  	INTEL_GUC_ACTION_LIMIT
>>  };
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>index 5dd174babf7a..3c3d48c7d5de 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>@@ -104,6 +104,8 @@ struct intel_guc {
>>  	u32 ads_regset_size;
>>  	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
>>  	u32 ads_golden_ctxt_size;
>>+	/** @ads_engine_usage_size: size of engine usage in the ADS */
>>+	u32 ads_engine_usage_size;
>>  	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
>>  	struct i915_vma *lrc_desc_pool;
>>@@ -138,6 +140,34 @@ struct intel_guc {
>>  	/** @send_mutex: used to serialize the intel_guc_send actions */
>>  	struct mutex send_mutex;
>>+
>>+	/**
>>+	 * @timestamp: GT timestamp object that stores a copy of the timestamp
>>+	 * and adjusts it for overflow using a worker.
>>+	 */
>>+	struct {
>>+		/**
>>+		 * @lock: Lock protecting the below fields and the engine stats.
>>+		 */
>>+		spinlock_t lock;
>>+
>>+		/**
>>+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
>>+		 */
>>+		u64 gt_stamp;
>>+
>>+		/**
>>+		 * @ping_delay: Period for polling the GT timestamp for
>>+		 * overflow.
>>+		 */
>>+		unsigned long ping_delay;
>>+
>>+		/**
>>+		 * @work: Periodic work to adjust GT timestamp, engine and
>>+		 * context usage for overflows.
>>+		 */
>>+		struct delayed_work work;
>>+	} timestamp;
>>  };
>>  static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>index 2c6ea64af7ec..ca9ab53999d5 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>@@ -26,6 +26,8 @@
>>   *      | guc_policies                          |
>>   *      +---------------------------------------+
>>   *      | guc_gt_system_info                    |
>>+ *      +---------------------------------------+
>>+ *      | guc_engine_usage                      |
>>   *      +---------------------------------------+ <== static
>>   *      | guc_mmio_reg[countA] (engine 0.0)     |
>>   *      | guc_mmio_reg[countB] (engine 0.1)     |
>>@@ -47,6 +49,7 @@ struct __guc_ads_blob {
>>  	struct guc_ads ads;
>>  	struct guc_policies policies;
>>  	struct guc_gt_system_info system_info;
>>+	struct guc_engine_usage engine_usage;
>>  	/* From here on, location is dynamic! Refer to above diagram. */
>>  	struct guc_mmio_reg regset[0];
>>  } __packed;
>>@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>>  	guc_ads_private_data_reset(guc);
>>  }
>>+
>>+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
>>+{
>>+	struct __guc_ads_blob *blob = guc->ads_blob;
>>+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
>>+	u32 offset = base + ptr_offset(blob, engine_usage);
>>+
>>+	return offset;
>>+}
>>+
>>+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
>>+{
>>+	struct intel_guc *guc = &engine->gt->uc.guc;
>>+	struct __guc_ads_blob *blob = guc->ads_blob;
>>+	u8 guc_class = engine_class_to_guc_class(engine->class);
>>+
>>+	return &blob->engine_usage.engines[guc_class][engine->instance];
>>+}
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>index 3d85051d57e4..e74c110facff 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>@@ -6,8 +6,11 @@
>>  #ifndef _INTEL_GUC_ADS_H_
>>  #define _INTEL_GUC_ADS_H_
>>+#include <linux/types.h>
>>+
>>  struct intel_guc;
>>  struct drm_printer;
>>+struct intel_engine_cs;
>>  int intel_guc_ads_create(struct intel_guc *guc);
>>  void intel_guc_ads_destroy(struct intel_guc *guc);
>>@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
>>  void intel_guc_ads_reset(struct intel_guc *guc);
>>  void intel_guc_ads_print_policy_info(struct intel_guc *guc,
>>  				     struct drm_printer *p);
>>+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
>>+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
>>  #endif
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>index fa4be13c8854..7c9c081670fc 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>@@ -294,6 +294,19 @@ struct guc_ads {
>>  	u32 reserved[15];
>>  } __packed;
>>+/* Engine usage stats */
>>+struct guc_engine_usage_record {
>>+	u32 current_context_index;
>>+	u32 last_switch_in_stamp;
>>+	u32 reserved0;
>>+	u32 total_runtime;
>>+	u32 reserved1[4];
>>+} __packed;
>>+
>>+struct guc_engine_usage {
>>+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
>>+} __packed;
>>+
>>  /* GuC logging structures */
>>  enum guc_log_buffer_type {
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>index ba0de35f6323..0c2e4d8d8ec3 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>@@ -12,6 +12,7 @@
>>  #include "gt/intel_engine_pm.h"
>>  #include "gt/intel_engine_heartbeat.h"
>>  #include "gt/intel_gt.h"
>>+#include "gt/intel_gt_clock_utils.h"
>>  #include "gt/intel_gt_irq.h"
>>  #include "gt/intel_gt_pm.h"
>>  #include "gt/intel_gt_requests.h"
>>@@ -20,6 +21,7 @@
>>  #include "gt/intel_mocs.h"
>>  #include "gt/intel_ring.h"
>>+#include "intel_guc_ads.h"
>>  #include "intel_guc_submission.h"
>>  #include "i915_drv.h"
>>@@ -750,6 +752,262 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>>  	xa_unlock_irqrestore(&guc->context_lookup, flags);
>>  }
>>+/*
>>+ * GuC stores busyness stats for each engine at context in/out boundaries. A
>>+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
>>+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
>>+ * GuC.
>>+ *
>>+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
>>+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
>>+ * active. For an active engine total busyness = total + (now - start), where
>>+ * 'now' is the time at which the busyness is sampled. For inactive engine,
>>+ * total busyness = total.
>>+ *
>>+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
>>+ *
>>+ * The start and total values provided by GuC are 32 bits and wrap around in a
>>+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
>>+ * increasing ns values, there is a need for this implementation to account for
>>+ * overflows and extend the GuC provided values to 64 bits before returning
>>+ * busyness to the user. In order to do that, a worker runs periodically with a
>>+ * period of 1/8th the time it takes for the timestamp to wrap (i.e. once every
>>+ * 27 seconds for a gt clock frequency of 19.2 MHz).
>>+ */
>>+
>>+#define WRAP_TIME_CLKS U32_MAX
>>+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
>>+
>>+static void
>>+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
>>+{
>>+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
>>+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
>>+
>>+	if (new_start == lower_32_bits(*prev_start))
>>+		return;
>>+
>>+	if (new_start < gt_stamp_last &&
>>+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
>>+		gt_stamp_hi++;
>>+
>>+	if (new_start > gt_stamp_last &&
>>+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
>>+		gt_stamp_hi--;
>>+
>>+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
>>+}
>>+
>>+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
>>+{
>>+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
>>+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
>>+	struct intel_guc *guc = &engine->gt->uc.guc;
>>+	u32 last_switch = rec->last_switch_in_stamp;
>>+	u32 ctx_id = rec->current_context_index;
>>+	u32 total = rec->total_runtime;
>>+
>>+	lockdep_assert_held(&guc->timestamp.lock);
>>+
>>+	stats->running = ctx_id != ~0U && last_switch;
>>+	if (stats->running)
>>+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
>>+
>>+	/*
>>+	 * Instead of adjusting the total for overflow, just add the
>>+	 * difference from the previous sample to stats->total_gt_clks.
>>+	 */
>>+	if (total && total != ~0U) {
>>+		stats->total_gt_clks += (u32)(total - stats->prev_total);
>>+		stats->prev_total = total;
>>+	}
>>+}
>>+
>>+static void guc_update_pm_timestamp(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	u32 gt_stamp_now, gt_stamp_hi;
>>+
>>+	lockdep_assert_held(&guc->timestamp.lock);
>>+
>>+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
>>+	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>+
>>+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
>>+		gt_stamp_hi++;
>>+
>>+	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
>>+}
>>+
>>+/*
>>+ * Unlike the execlist mode of submission, total and active times are in terms
>>+ * of gt clocks. The *now parameter is retained to return the cpu time at which
>>+ * the busyness was sampled.
>>+ */
>>+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
>>+{
>>+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
>>+	struct intel_gt *gt = engine->gt;
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+	unsigned long flags;
>>+	bool reset_in_progress;
>>+	u64 total;
>>+	int srcu;
>>+
>>+	/*
>>+	 * If a reset is in progress, we risk reading partially updated
>>+	 * engine busyness from GuC, so we just use the driver stored
>>+	 * copy of busyness. Synchronize with gt reset lock to achieve
>>+	 * this.
>>+	 */
>>+	reset_in_progress = intel_gt_reset_trylock_no_wait(gt, &srcu);
>>+
>>+	/*
>>+	 * The order of taking the reset lock first and then the
>>+	 * timestamp lock is intentional to avoid lock inversion related
>>+	 * issues.
>>+	 */
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	*now = ktime_get();
>>+
>>+	/*
>>+	 * The active busyness depends on start_gt_clk and gt_stamp.
>>+	 * gt_stamp is updated by i915 only when gt is awake and the
>>+	 * start_gt_clk is derived from GuC state. To get a consistent
>>+	 * view of activity, we query the GuC state only if gt is awake.
>>+	 */
>>+	if (intel_gt_pm_get_if_awake(gt) && !reset_in_progress) {
>>+		guc_update_engine_gt_clks(engine);
>>+		guc_update_pm_timestamp(guc);
>>+		intel_gt_pm_put_async(gt);
>>+	}
>>+
>>+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
>>+	if (stats->running) {
>>+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
>>+
>>+		total += intel_gt_clock_interval_to_ns(gt, clk);
>>+	}
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+	if (!reset_in_progress)
>>+		intel_gt_reset_unlock(gt, srcu);
>>+
>>+	return ns_to_ktime(total);
>>+}
>>+
>>+static void __reset_guc_busyness_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	struct intel_engine_cs *engine;
>>+	enum intel_engine_id id;
>>+	unsigned long flags;
>>+
>>+	cancel_delayed_work_sync(&guc->timestamp.work);
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	guc_update_pm_timestamp(guc);
>>+	for_each_engine(engine, gt, id) {
>>+		guc_update_engine_gt_clks(engine);
>>+		engine->stats.guc.prev_total = 0;
>>+	}
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+}
>>+
>>+static void __update_guc_busyness_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	struct intel_engine_cs *engine;
>>+	enum intel_engine_id id;
>>+
>>+	guc_update_pm_timestamp(guc);
>>+	for_each_engine(engine, gt, id)
>>+		guc_update_engine_gt_clks(engine);
>>+}
>>+
>>+static void guc_timestamp_ping(struct work_struct *wrk)
>>+{
>>+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>+					     timestamp.work.work);
>>+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	intel_wakeref_t wakeref;
>>+	unsigned long flags;
>>+	int srcu, ret;
>>+
>>+	/*
>>+	 * Synchronize with gt reset to make sure the worker does not
>>+	 * corrupt the engine/guc stats.
>>+	 */
>>+	ret = intel_gt_reset_trylock(gt, &srcu);
>>+	if (ret)
>>+		return;
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>+		__update_guc_busyness_stats(guc);
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+
>>+	intel_gt_reset_unlock(gt, srcu);
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>>+}
>>+
>>+static int guc_action_enable_usage_stats(struct intel_guc *guc)
>>+{
>>+	u32 offset = intel_guc_engine_usage_offset(guc);
>>+	u32 action[] = {
>>+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
>>+		offset,
>>+		0,
>>+	};
>>+
>>+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
>>+}
>>+
>>+static void guc_init_engine_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	intel_wakeref_t wakeref;
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>>+
>>+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
>>+		int ret = guc_action_enable_usage_stats(guc);
>>+
>>+		if (ret)
>>+			drm_err(&gt->i915->drm,
>>+				"Failed to enable usage stats: %d!\n", ret);
>>+	}
>>+}
>>+
>>+void intel_guc_busyness_park(struct intel_gt *gt)
>>+{
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+	unsigned long flags;
>>+
>>+	cancel_delayed_work(&guc->timestamp.work);
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+	__update_guc_busyness_stats(guc);
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+}
>>+
>>+void intel_guc_busyness_unpark(struct intel_gt *gt)
>>+{
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>
>You will need some sort of "is guc in use" check here.

Correct, that check is needed to fix the CI machines that are not booting.

Thanks,
Umesh

>Regards,
>
>Tvrtko

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-13  0:56 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
  2021-10-13 16:06   ` Tvrtko Ursulin
@ 2021-10-14  8:21   ` Tvrtko Ursulin
  2021-10-15  1:01     ` Umesh Nerlige Ramappa
  1 sibling, 1 reply; 31+ messages in thread
From: Tvrtko Ursulin @ 2021-10-14  8:21 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, dri-devel
  Cc: john.c.harrison, daniel.vetter, Matthew Brost


On 13/10/2021 01:56, Umesh Nerlige Ramappa wrote:
> With GuC handling scheduling, i915 is not aware of the time that a
> context is scheduled in and out of the engine. Since i915 pmu relies on
> this info to provide engine busyness to the user, GuC shares this info
> with i915 for all engines using shared memory. For each engine, this
> info contains:
> 
> - total busyness: total time that the context was running (total)
> - id: id of the running context (id)
> - start timestamp: timestamp when the context started running (start)
> 
> At the time (now) of sampling the engine busyness, if the id is valid
> (!= ~0), and start is non-zero, then the context is considered to be
> active and the engine busyness is calculated using the below equation
> 
> 	engine busyness = total + (now - start)
> 
> All times are obtained from the gt clock base. For inactive contexts,
> engine busyness is just equal to the total.
> 
> The start and total values provided by GuC are 32 bits and wrap around
> in a few minutes. Since perf pmu provides busyness as 64 bit
> monotonically increasing values, there is a need for this implementation
> to account for overflows and extend the time to 64 bits before returning
> busyness to the user. In order to do that, a worker runs periodically with
> a period of 1/8th the time it takes for the timestamp to wrap. As an
> example, that would be once every 27 seconds for a gt clock frequency of
> 19.2 MHz.
> 
> Note:
> There might be an overaccounting of busyness due to the fact that GuC
> may be updating the total and start values while kmd is reading them
> (i.e. kmd may read the updated total and the stale start). In such a
> case, the user may see a higher busyness value followed by smaller ones,
> which would eventually catch up to the higher value.
> 
> v2: (Tvrtko)
> - Include details in commit message
> - Move intel engine busyness function into execlist code
> - Use union inside engine->stats
> - Use natural type for ping delay jiffies
> - Drop active_work condition checks
> - Use for_each_engine if iterating all engines
> - Drop seq locking, use spinlock at guc level to update engine stats
> - Document worker specific details
> 
> v3: (Tvrtko/Umesh)
> - Demarcate guc and execlist stat objects with comments
> - Document known over-accounting issue in commit
> - Provide a consistent view of guc state
> - Add hooks to gt park/unpark for guc busyness
> - Stop/start worker in gt park/unpark path
> - Drop inline
> - Move spinlock and worker inits to guc initialization
> - Drop helpers that are called only once
> 
> v4: (Tvrtko/Matt/Umesh)
> - Drop addressed opens from commit message
> - Get runtime pm in ping, remove from the park path
> - Use cancel_delayed_work_sync in disable_submission path
> - Update stats during reset prepare
> - Skip ping if reset in progress
> - Explicitly name execlists and guc stats objects
> - Since disable_submission is called from many places, move resetting
>    stats to intel_guc_submission_reset_prepare
> 
> v5: (Tvrtko)
> - Add a trylock helper that does not sleep and synchronize PMU event
>    callbacks and worker with gt reset
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
>   drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
>   .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
>   drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
>   drivers/gpu/drm/i915/gt/intel_reset.c         |  16 ++
>   drivers/gpu/drm/i915/gt/intel_reset.h         |   1 +
>   .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++++++++++++++++++
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
>   drivers/gpu/drm/i915/i915_reg.h               |   2 +
>   14 files changed, 427 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 38436f4b5706..6b783fdcba2a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   	intel_engine_print_breadcrumbs(engine, m);
>   }
>   
> -static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
> -					    ktime_t *now)
> -{
> -	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> -	ktime_t total = stats->total;
> -
> -	/*
> -	 * If the engine is executing something at the moment
> -	 * add it to the total.
> -	 */
> -	*now = ktime_get();
> -	if (READ_ONCE(stats->active))
> -		total = ktime_add(total, ktime_sub(*now, stats->start));
> -
> -	return total;
> -}
> -
>   /**
>    * intel_engine_get_busy_time() - Return current accumulated engine busyness
>    * @engine: engine to report on
> @@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>    */
>   ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
>   {
> -	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> -	unsigned int seq;
> -	ktime_t total;
> -
> -	do {
> -		seq = read_seqcount_begin(&stats->lock);
> -		total = __intel_engine_get_busy_time(engine, now);
> -	} while (read_seqcount_retry(&stats->lock, seq));
> -
> -	return total;
> +	return engine->busyness(engine, now);
>   }
>   
>   struct intel_context *
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index b820a2c1124e..9300c65d6675 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
>   	ktime_t start;
>   };
>   
> +struct intel_engine_guc_stats {
> +	/**
> +	 * @running: Active state of the engine when busyness was last sampled.
> +	 */
> +	bool running;
> +
> +	/**
> +	 * @prev_total: Previous value of total runtime clock cycles.
> +	 */
> +	u32 prev_total;
> +
> +	/**
> +	 * @total_gt_clks: Total gt clock cycles this engine was busy.
> +	 */
> +	u64 total_gt_clks;
> +
> +	/**
> +	 * @start_gt_clk: GT clock time of last idle to active transition.
> +	 */
> +	u64 start_gt_clk;
> +};
> +
>   struct intel_engine_cs {
>   	struct drm_i915_private *i915;
>   	struct intel_gt *gt;
> @@ -459,6 +481,12 @@ struct intel_engine_cs {
>   	void		(*add_active_request)(struct i915_request *rq);
>   	void		(*remove_active_request)(struct i915_request *rq);
>   
> +	/*
> +	 * Get engine busyness and the time at which the busyness was sampled.
> +	 */
> +	ktime_t		(*busyness)(struct intel_engine_cs *engine,
> +				    ktime_t *now);
> +
>   	struct intel_engine_execlists execlists;
>   
>   	/*
> @@ -508,7 +536,10 @@ struct intel_engine_cs {
>   	u32 (*get_cmd_length_mask)(u32 cmd_header);
>   
>   	struct {
> -		struct intel_engine_execlists_stats execlists;
> +		union {
> +			struct intel_engine_execlists_stats execlists;
> +			struct intel_engine_guc_stats guc;
> +		};
>   
>   		/**
>   		 * @rps: Utilisation at last RPS sampling.
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 73a79c2acd3a..e8ffcf36f6f4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
>   	lrc_fini_wa_ctx(engine);
>   }
>   
> +static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
> +					   ktime_t *now)
> +{
> +	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> +	ktime_t total = stats->total;
> +
> +	/*
> +	 * If the engine is executing something at the moment
> +	 * add it to the total.
> +	 */
> +	*now = ktime_get();
> +	if (READ_ONCE(stats->active))
> +		total = ktime_add(total, ktime_sub(*now, stats->start));
> +
> +	return total;
> +}
> +
> +static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
> +					 ktime_t *now)
> +{
> +	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> +	unsigned int seq;
> +	ktime_t total;
> +
> +	do {
> +		seq = read_seqcount_begin(&stats->lock);
> +		total = __execlists_engine_busyness(engine, now);
> +	} while (read_seqcount_retry(&stats->lock, seq));
> +
> +	return total;
> +}
> +
>   static void
>   logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   {
> @@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   		engine->emit_bb_start = gen8_emit_bb_start;
>   	else
>   		engine->emit_bb_start = gen8_emit_bb_start_noarb;
> +
> +	engine->busyness = execlists_engine_busyness;
>   }
>   
>   static void logical_ring_default_irqs(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 524eaf678790..b4a8594bc46c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
>   	intel_rc6_unpark(&gt->rc6);
>   	intel_rps_unpark(&gt->rps);
>   	i915_pmu_gt_unparked(i915);
> +	intel_guc_busyness_unpark(gt);
>   
>   	intel_gt_unpark_requests(gt);
>   	runtime_begin(gt);
> @@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
>   	runtime_end(gt);
>   	intel_gt_park_requests(gt);
>   
> +	intel_guc_busyness_park(gt);
>   	i915_vma_parked(gt);
>   	i915_pmu_gt_parked(i915);
>   	intel_rps_park(&gt->rps);
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 91200c43951f..ac12163c3639 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1389,6 +1389,22 @@ void intel_gt_handle_error(struct intel_gt *gt,
>   	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
>   }
>   
> +bool intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu)
> +{
> +	int reset_in_progress;
> +
> +	might_lock(&gt->reset.backoff_srcu);
> +	cant_sleep();
> +
> +	rcu_read_lock();
> +	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
> +	if (!reset_in_progress)
> +		*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
> +	rcu_read_unlock();
> +
> +	return reset_in_progress;
> +}
> +
>   int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
>   {
>   	might_lock(&gt->reset.backoff_srcu);
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
> index adc734e67387..4f5f4c00c54f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.h
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.h
> @@ -38,6 +38,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine,
>   
>   void __i915_request_reset(struct i915_request *rq, bool guilty);
>   
> +bool __must_check intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu);
>   int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
>   void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> index 8ff582222aff..ff1311d4beff 100644
> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> @@ -143,6 +143,7 @@ enum intel_guc_action {
>   	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>   	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>   	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
> +	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>   	INTEL_GUC_ACTION_LIMIT
>   };
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index 5dd174babf7a..3c3d48c7d5de 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -104,6 +104,8 @@ struct intel_guc {
>   	u32 ads_regset_size;
>   	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
>   	u32 ads_golden_ctxt_size;
> +	/** @ads_engine_usage_size: size of engine usage in the ADS */
> +	u32 ads_engine_usage_size;
>   
>   	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
>   	struct i915_vma *lrc_desc_pool;
> @@ -138,6 +140,34 @@ struct intel_guc {
>   
>   	/** @send_mutex: used to serialize the intel_guc_send actions */
>   	struct mutex send_mutex;
> +
> +	/**
> +	 * @timestamp: GT timestamp object that stores a copy of the timestamp
> +	 * and adjusts it for overflow using a worker.
> +	 */
> +	struct {
> +		/**
> +		 * @lock: Lock protecting the below fields and the engine stats.
> +		 */
> +		spinlock_t lock;
> +
> +		/**
> +		 * @gt_stamp: 64 bit extended value of the GT timestamp.
> +		 */
> +		u64 gt_stamp;
> +
> +		/**
> +		 * @ping_delay: Period for polling the GT timestamp for
> +		 * overflow.
> +		 */
> +		unsigned long ping_delay;
> +
> +		/**
> +		 * @work: Periodic work to adjust GT timestamp, engine and
> +		 * context usage for overflows.
> +		 */
> +		struct delayed_work work;
> +	} timestamp;
>   };
>   
>   static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> index 2c6ea64af7ec..ca9ab53999d5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> @@ -26,6 +26,8 @@
>    *      | guc_policies                          |
>    *      +---------------------------------------+
>    *      | guc_gt_system_info                    |
> + *      +---------------------------------------+
> + *      | guc_engine_usage                      |
>    *      +---------------------------------------+ <== static
>    *      | guc_mmio_reg[countA] (engine 0.0)     |
>    *      | guc_mmio_reg[countB] (engine 0.1)     |
> @@ -47,6 +49,7 @@ struct __guc_ads_blob {
>   	struct guc_ads ads;
>   	struct guc_policies policies;
>   	struct guc_gt_system_info system_info;
> +	struct guc_engine_usage engine_usage;
>   	/* From here on, location is dynamic! Refer to above diagram. */
>   	struct guc_mmio_reg regset[0];
>   } __packed;
> @@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>   
>   	guc_ads_private_data_reset(guc);
>   }
> +
> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
> +{
> +	struct __guc_ads_blob *blob = guc->ads_blob;
> +	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
> +	u32 offset = base + ptr_offset(blob, engine_usage);
> +
> +	return offset;
> +}
> +
> +struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
> +{
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	struct __guc_ads_blob *blob = guc->ads_blob;
> +	u8 guc_class = engine_class_to_guc_class(engine->class);
> +
> +	return &blob->engine_usage.engines[guc_class][engine->instance];
> +}
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> index 3d85051d57e4..e74c110facff 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> @@ -6,8 +6,11 @@
>   #ifndef _INTEL_GUC_ADS_H_
>   #define _INTEL_GUC_ADS_H_
>   
> +#include <linux/types.h>
> +
>   struct intel_guc;
>   struct drm_printer;
> +struct intel_engine_cs;
>   
>   int intel_guc_ads_create(struct intel_guc *guc);
>   void intel_guc_ads_destroy(struct intel_guc *guc);
> @@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
>   void intel_guc_ads_reset(struct intel_guc *guc);
>   void intel_guc_ads_print_policy_info(struct intel_guc *guc,
>   				     struct drm_printer *p);
> +struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
>   
>   #endif
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index fa4be13c8854..7c9c081670fc 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -294,6 +294,19 @@ struct guc_ads {
>   	u32 reserved[15];
>   } __packed;
>   
> +/* Engine usage stats */
> +struct guc_engine_usage_record {
> +	u32 current_context_index;
> +	u32 last_switch_in_stamp;
> +	u32 reserved0;
> +	u32 total_runtime;
> +	u32 reserved1[4];
> +} __packed;
> +
> +struct guc_engine_usage {
> +	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
> +} __packed;
> +
>   /* GuC logging structures */
>   
>   enum guc_log_buffer_type {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index ba0de35f6323..0c2e4d8d8ec3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -12,6 +12,7 @@
>   #include "gt/intel_engine_pm.h"
>   #include "gt/intel_engine_heartbeat.h"
>   #include "gt/intel_gt.h"
> +#include "gt/intel_gt_clock_utils.h"
>   #include "gt/intel_gt_irq.h"
>   #include "gt/intel_gt_pm.h"
>   #include "gt/intel_gt_requests.h"
> @@ -20,6 +21,7 @@
>   #include "gt/intel_mocs.h"
>   #include "gt/intel_ring.h"
>   
> +#include "intel_guc_ads.h"
>   #include "intel_guc_submission.h"
>   
>   #include "i915_drv.h"
> @@ -750,6 +752,262 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>   	xa_unlock_irqrestore(&guc->context_lookup, flags);
>   }
>   
> +/*
> + * GuC stores busyness stats for each engine at context in/out boundaries. A
> + * context 'in' logs execution start time, 'out' adds in -> out delta to total.
> + * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
> + * GuC.
> + *
> + * __i915_pmu_event_read samples engine busyness. When sampling, if context id
> + * is valid (!= ~0) and start is non-zero, the engine is considered to be
> + * active. For an active engine total busyness = total + (now - start), where
> + * 'now' is the time at which the busyness is sampled. For inactive engine,
> + * total busyness = total.
> + *
> + * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
> + *
> + * The start and total values provided by GuC are 32 bits and wrap around in a
> + * few minutes. Since perf pmu provides busyness as 64 bit monotonically
> + * increasing ns values, there is a need for this implementation to account for
> + * overflows and extend the GuC provided values to 64 bits before returning
> + * busyness to the user. In order to do that, a worker runs periodically with
> + * a period of 1/8th the time it takes for the timestamp to wrap (i.e. once
> + * every 27 seconds for a gt clock frequency of 19.2 MHz).
> + */
> +
> +#define WRAP_TIME_CLKS U32_MAX
> +#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
> +
> +static void
> +__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
> +{
> +	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
> +	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
> +
> +	if (new_start == lower_32_bits(*prev_start))
> +		return;
> +
> +	if (new_start < gt_stamp_last &&
> +	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
> +		gt_stamp_hi++;
> +
> +	if (new_start > gt_stamp_last &&
> +	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
> +		gt_stamp_hi--;
> +
> +	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
> +}
> +
> +static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
> +{
> +	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
> +	struct intel_engine_guc_stats *stats = &engine->stats.guc;
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	u32 last_switch = rec->last_switch_in_stamp;
> +	u32 ctx_id = rec->current_context_index;
> +	u32 total = rec->total_runtime;
> +
> +	lockdep_assert_held(&guc->timestamp.lock);
> +
> +	stats->running = ctx_id != ~0U && last_switch;
> +	if (stats->running)
> +		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
> +
> +	/*
> +	 * Instead of adjusting the total for overflow, just add the
> +	 * difference from previous sample stats->total_gt_clks
> +	 */
> +	if (total && total != ~0U) {
> +		stats->total_gt_clks += (u32)(total - stats->prev_total);
> +		stats->prev_total = total;
> +	}
> +}
> +
> +static void guc_update_pm_timestamp(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	u32 gt_stamp_now, gt_stamp_hi;
> +
> +	lockdep_assert_held(&guc->timestamp.lock);
> +
> +	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
> +	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
> +
> +	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
> +		gt_stamp_hi++;
> +
> +	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
> +}
> +
> +/*
> + * Unlike the execlist mode of submission, total and active times are in terms
> + * of gt clocks. The *now parameter is retained to return the cpu time at which
> + * the busyness was sampled.
> + */
> +static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
> +{
> +	struct intel_engine_guc_stats *stats = &engine->stats.guc;
> +	struct intel_gt *gt = engine->gt;
> +	struct intel_guc *guc = &gt->uc.guc;
> +	unsigned long flags;
> +	bool reset_in_progress;
> +	u64 total;
> +	int srcu;
> +
> +	/*
> +	 * If a reset is in progress, we risk reading partially updated
> +	 * engine busyness from GuC, so we just use the driver stored
> +	 * copy of busyness. Synchronize with gt reset lock to achieve
> +	 * this.
> +	 */
> +	reset_in_progress = intel_gt_reset_trylock_no_wait(gt, &srcu);
> +
> +	/*
> +	 * The order of taking the reset lock first and then the
> +	 * timestamp lock is intentional to avoid lock inversion related
> +	 * issues.
> +	 */
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	*now = ktime_get();
> +
> +	/*
> +	 * The active busyness depends on start_gt_clk and gt_stamp.
> +	 * gt_stamp is updated by i915 only when gt is awake and the
> +	 * start_gt_clk is derived from GuC state. To get a consistent
> +	 * view of activity, we query the GuC state only if gt is awake.
> +	 */
> +	if (intel_gt_pm_get_if_awake(gt) && !reset_in_progress) {
> +		guc_update_engine_gt_clks(engine);
> +		guc_update_pm_timestamp(guc);
> +		intel_gt_pm_put_async(gt);
> +	}
> +
> +	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
> +	if (stats->running) {
> +		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
> +
> +		total += intel_gt_clock_interval_to_ns(gt, clk);
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +	if (!reset_in_progress)
> +		intel_gt_reset_unlock(gt, srcu);
> +
> +	return ns_to_ktime(total);
> +}
> +
> +static void __reset_guc_busyness_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	unsigned long flags;
> +
> +	cancel_delayed_work_sync(&guc->timestamp.work);
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	guc_update_pm_timestamp(guc);
> +	for_each_engine(engine, gt, id) {
> +		guc_update_engine_gt_clks(engine);
> +		engine->stats.guc.prev_total = 0;
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void __update_guc_busyness_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	guc_update_pm_timestamp(guc);
> +	for_each_engine(engine, gt, id)
> +		guc_update_engine_gt_clks(engine);
> +}
> +
> +static void guc_timestamp_ping(struct work_struct *wrk)
> +{
> +	struct intel_guc *guc = container_of(wrk, typeof(*guc),
> +					     timestamp.work.work);
> +	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +	unsigned long flags;
> +	int srcu, ret;
> +
> +	/*
> +	 * Synchronize with gt reset to make sure the worker does not
> +	 * corrupt the engine/guc stats.
> +	 */
> +	ret = intel_gt_reset_trylock(gt, &srcu);
> +	if (ret)
> +		return;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
> +		__update_guc_busyness_stats(guc);
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +
> +	intel_gt_reset_unlock(gt, srcu);
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +}
> +
> +static int guc_action_enable_usage_stats(struct intel_guc *guc)
> +{
> +	u32 offset = intel_guc_engine_usage_offset(guc);
> +	u32 action[] = {
> +		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
> +		offset,
> +		0,
> +	};
> +
> +	return intel_guc_send(guc, action, ARRAY_SIZE(action));
> +}
> +
> +static void guc_init_engine_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
> +		int ret = guc_action_enable_usage_stats(guc);
> +
> +		if (ret)
> +			drm_err(&gt->i915->drm,
> +				"Failed to enable usage stats: %d!\n", ret);
> +	}
> +}
> +
> +void intel_guc_busyness_park(struct intel_gt *gt)
> +{
> +	struct intel_guc *guc = &gt->uc.guc;
> +	unsigned long flags;
> +
> +	cancel_delayed_work(&guc->timestamp.work);
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +	__update_guc_busyness_stats(guc);
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +void intel_guc_busyness_unpark(struct intel_gt *gt)
> +{
> +	struct intel_guc *guc = &gt->uc.guc;
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);

You will need some sort of "is guc in use" check here.

Regards,

Tvrtko

> +}
> +
>   static inline bool
>   submission_disabled(struct intel_guc *guc)
>   {
> @@ -809,6 +1067,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
>   	intel_gt_park_heartbeats(guc_to_gt(guc));
>   	disable_submission(guc);
>   	guc->interrupts.disable(guc);
> +	__reset_guc_busyness_stats(guc);
>   
>   	/* Flush IRQ handler */
>   	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
> @@ -1132,6 +1391,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
>    */
>   int intel_guc_submission_init(struct intel_guc *guc)
>   {
> +	struct intel_gt *gt = guc_to_gt(guc);
>   	int ret;
>   
>   	if (guc->lrc_desc_pool)
> @@ -1152,6 +1412,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
>   	INIT_LIST_HEAD(&guc->guc_id_list);
>   	ida_init(&guc->guc_ids);
>   
> +	spin_lock_init(&guc->timestamp.lock);
> +	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
> +	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
> +
>   	return 0;
>   }
>   
> @@ -2606,7 +2870,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>   		engine->emit_flush = gen12_emit_flush_xcs;
>   	}
>   	engine->set_default_submission = guc_set_default_submission;
> +	engine->busyness = guc_engine_busyness;
>   
> +	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>   	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>   	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
>   
> @@ -2705,6 +2971,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
>   void intel_guc_submission_enable(struct intel_guc *guc)
>   {
>   	guc_init_lrc_mapping(guc);
> +	guc_init_engine_stats(guc);
>   }
>   
>   void intel_guc_submission_disable(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index c7ef44fa0c36..5a95a9f0a8e3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>   void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
>   				    struct i915_request *hung_rq,
>   				    struct drm_printer *m);
> +void intel_guc_busyness_park(struct intel_gt *gt);
> +void intel_guc_busyness_unpark(struct intel_gt *gt);
>   
>   bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>   
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index a897f4abea0c..9aee08425382 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>   #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
>   #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
>   
> +#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
> +
>   /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
>   #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
>   #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-13 16:06   ` Tvrtko Ursulin
@ 2021-10-13 16:27     ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-13 16:27 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Wed, Oct 13, 2021 at 05:06:26PM +0100, Tvrtko Ursulin wrote:
>
>On 13/10/2021 01:56, Umesh Nerlige Ramappa wrote:
>>With GuC handling scheduling, i915 is not aware of the time that a
>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>this info to provide engine busyness to the user, GuC shares this info
>>with i915 for all engines using shared memory. For each engine, this
>>info contains:
>>
>>- total busyness: total time that the context was running (total)
>>- id: id of the running context (id)
>>- start timestamp: timestamp when the context started running (start)
>>
>>At the time (now) of sampling the engine busyness, if the id is valid
>>(!= ~0), and start is non-zero, then the context is considered to be
>>active and the engine busyness is calculated using the below equation
>>
>>	engine busyness = total + (now - start)
>>
>>All times are obtained from the gt clock base. For inactive contexts,
>>engine busyness is just equal to the total.
>>
>>The start and total values provided by GuC are 32 bits and wrap around
>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>monotonically increasing values, there is a need for this implementation
>>to account for overflows and extend the time to 64 bits before returning
>>busyness to the user. In order to do that, a worker runs periodically at
>>frequency = 1/8th the time it takes for the timestamp to wrap. As an
>>example, that would be once in 27 seconds for a gt clock frequency of
>>19.2 MHz.
>>
>>Note:
>>There might be an overaccounting of busyness due to the fact that GuC
>>may be updating the total and start values while kmd is reading them.
>>(i.e. kmd may read the updated total and the stale start). In such a
>>case, user may see higher busyness value followed by smaller ones which
>>would eventually catch up to the higher value.
>>
>>v2: (Tvrtko)
>>- Include details in commit message
>>- Move intel engine busyness function into execlist code
>>- Use union inside engine->stats
>>- Use natural type for ping delay jiffies
>>- Drop active_work condition checks
>>- Use for_each_engine if iterating all engines
>>- Drop seq locking, use spinlock at guc level to update engine stats
>>- Document worker specific details
>>
>>v3: (Tvrtko/Umesh)
>>- Demarcate guc and execlist stat objects with comments
>>- Document known over-accounting issue in commit
>>- Provide a consistent view of guc state
>>- Add hooks to gt park/unpark for guc busyness
>>- Stop/start worker in gt park/unpark path
>>- Drop inline
>>- Move spinlock and worker inits to guc initialization
>>- Drop helpers that are called only once
>>
>>v4: (Tvrtko/Matt/Umesh)
>>- Drop addressed opens from commit message
>>- Get runtime pm in ping, remove from the park path
>>- Use cancel_delayed_work_sync in disable_submission path
>>- Update stats during reset prepare
>>- Skip ping if reset in progress
>>- Explicitly name execlists and guc stats objects
>>- Since disable_submission is called from many places, move resetting
>>   stats to intel_guc_submission_reset_prepare
>>
>>v5: (Tvrtko)
>>- Add a trylock helper that does not sleep and synchronize PMU event
>>   callbacks and worker with gt reset
>
>Looks good to me now, for some combination of high level and 
>incomplete low level review (I did not check the overflow handling or 
>the GuC page layout and flow). Both patches:
>
>Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Thanks

>
>Do you have someone available to check the parts I did not and r-b?

I will check with Matt/John.

Regards,
Umesh
>
>Regards,
>
>Tvrtko
>
>>
>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>---
>>  drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
>>  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
>>  .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
>>  drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
>>  drivers/gpu/drm/i915/gt/intel_reset.c         |  16 ++
>>  drivers/gpu/drm/i915/gt/intel_reset.h         |   1 +
>>  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
>>  drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++++++++++++++++++
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
>>  drivers/gpu/drm/i915/i915_reg.h               |   2 +
>>  14 files changed, 427 insertions(+), 28 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>index 38436f4b5706..6b783fdcba2a 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>>  	intel_engine_print_breadcrumbs(engine, m);
>>  }
>>-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>>-					    ktime_t *now)
>>-{
>>-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>-	ktime_t total = stats->total;
>>-
>>-	/*
>>-	 * If the engine is executing something at the moment
>>-	 * add it to the total.
>>-	 */
>>-	*now = ktime_get();
>>-	if (READ_ONCE(stats->active))
>>-		total = ktime_add(total, ktime_sub(*now, stats->start));
>>-
>>-	return total;
>>-}
>>-
>>  /**
>>   * intel_engine_get_busy_time() - Return current accumulated engine busyness
>>   * @engine: engine to report on
>>@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>>   */
>>  ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
>>  {
>>-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>-	unsigned int seq;
>>-	ktime_t total;
>>-
>>-	do {
>>-		seq = read_seqcount_begin(&stats->lock);
>>-		total = __intel_engine_get_busy_time(engine, now);
>>-	} while (read_seqcount_retry(&stats->lock, seq));
>>-
>>-	return total;
>>+	return engine->busyness(engine, now);
>>  }
>>  struct intel_context *
>>diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>index b820a2c1124e..9300c65d6675 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
>>  	ktime_t start;
>>  };
>>+struct intel_engine_guc_stats {
>>+	/**
>>+	 * @running: Active state of the engine when busyness was last sampled.
>>+	 */
>>+	bool running;
>>+
>>+	/**
>>+	 * @prev_total: Previous value of total runtime clock cycles.
>>+	 */
>>+	u32 prev_total;
>>+
>>+	/**
>>+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
>>+	 */
>>+	u64 total_gt_clks;
>>+
>>+	/**
>>+	 * @start_gt_clk: GT clock time of last idle to active transition.
>>+	 */
>>+	u64 start_gt_clk;
>>+};
>>+
>>  struct intel_engine_cs {
>>  	struct drm_i915_private *i915;
>>  	struct intel_gt *gt;
>>@@ -459,6 +481,12 @@ struct intel_engine_cs {
>>  	void		(*add_active_request)(struct i915_request *rq);
>>  	void		(*remove_active_request)(struct i915_request *rq);
>>+	/*
>>+	 * Get engine busyness and the time at which the busyness was sampled.
>>+	 */
>>+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
>>+				    ktime_t *now);
>>+
>>  	struct intel_engine_execlists execlists;
>>  	/*
>>@@ -508,7 +536,10 @@ struct intel_engine_cs {
>>  	u32 (*get_cmd_length_mask)(u32 cmd_header);
>>  	struct {
>>-		struct intel_engine_execlists_stats execlists;
>>+		union {
>>+			struct intel_engine_execlists_stats execlists;
>>+			struct intel_engine_guc_stats guc;
>>+		};
>>  		/**
>>  		 * @rps: Utilisation at last RPS sampling.
>>diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>index 73a79c2acd3a..e8ffcf36f6f4 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>@@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
>>  	lrc_fini_wa_ctx(engine);
>>  }
>>+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
>>+					   ktime_t *now)
>>+{
>>+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>+	ktime_t total = stats->total;
>>+
>>+	/*
>>+	 * If the engine is executing something at the moment
>>+	 * add it to the total.
>>+	 */
>>+	*now = ktime_get();
>>+	if (READ_ONCE(stats->active))
>>+		total = ktime_add(total, ktime_sub(*now, stats->start));
>>+
>>+	return total;
>>+}
>>+
>>+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
>>+					 ktime_t *now)
>>+{
>>+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>+	unsigned int seq;
>>+	ktime_t total;
>>+
>>+	do {
>>+		seq = read_seqcount_begin(&stats->lock);
>>+		total = __execlists_engine_busyness(engine, now);
>>+	} while (read_seqcount_retry(&stats->lock, seq));
>>+
>>+	return total;
>>+}
>>+
>>  static void
>>  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>  {
>>@@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>  		engine->emit_bb_start = gen8_emit_bb_start;
>>  	else
>>  		engine->emit_bb_start = gen8_emit_bb_start_noarb;
>>+
>>+	engine->busyness = execlists_engine_busyness;
>>  }
>>  static void logical_ring_default_irqs(struct intel_engine_cs *engine)
>>diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>index 524eaf678790..b4a8594bc46c 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
>>  	intel_rc6_unpark(&gt->rc6);
>>  	intel_rps_unpark(&gt->rps);
>>  	i915_pmu_gt_unparked(i915);
>>+	intel_guc_busyness_unpark(gt);
>>  	intel_gt_unpark_requests(gt);
>>  	runtime_begin(gt);
>>@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
>>  	runtime_end(gt);
>>  	intel_gt_park_requests(gt);
>>+	intel_guc_busyness_park(gt);
>>  	i915_vma_parked(gt);
>>  	i915_pmu_gt_parked(i915);
>>  	intel_rps_park(&gt->rps);
>>diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
>>index 91200c43951f..ac12163c3639 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_reset.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
>>@@ -1389,6 +1389,22 @@ void intel_gt_handle_error(struct intel_gt *gt,
>>  	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
>>  }
>>+bool intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu)
>>+{
>>+	int reset_in_progress;
>>+
>>+	might_lock(&gt->reset.backoff_srcu);
>>+	cant_sleep();
>>+
>>+	rcu_read_lock();
>>+	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
>>+	if (!reset_in_progress)
>>+		*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
>>+	rcu_read_unlock();
>>+
>>+	return reset_in_progress;
>>+}
>>+
>>  int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
>>  {
>>  	might_lock(&gt->reset.backoff_srcu);
>>diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
>>index adc734e67387..4f5f4c00c54f 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_reset.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
>>@@ -38,6 +38,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine,
>>  void __i915_request_reset(struct i915_request *rq, bool guilty);
>>+bool __must_check intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu);
>>  int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
>>  void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
>>diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>index 8ff582222aff..ff1311d4beff 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>@@ -143,6 +143,7 @@ enum intel_guc_action {
>>  	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>>  	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>>  	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
>>+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>>  	INTEL_GUC_ACTION_LIMIT
>>  };
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>index 5dd174babf7a..3c3d48c7d5de 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>@@ -104,6 +104,8 @@ struct intel_guc {
>>  	u32 ads_regset_size;
>>  	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
>>  	u32 ads_golden_ctxt_size;
>>+	/** @ads_engine_usage_size: size of engine usage in the ADS */
>>+	u32 ads_engine_usage_size;
>>  	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
>>  	struct i915_vma *lrc_desc_pool;
>>@@ -138,6 +140,34 @@ struct intel_guc {
>>  	/** @send_mutex: used to serialize the intel_guc_send actions */
>>  	struct mutex send_mutex;
>>+
>>+	/**
>>+	 * @timestamp: GT timestamp object that stores a copy of the timestamp
>>+	 * and adjusts it for overflow using a worker.
>>+	 */
>>+	struct {
>>+		/**
>>+		 * @lock: Lock protecting the below fields and the engine stats.
>>+		 */
>>+		spinlock_t lock;
>>+
>>+		/**
>>+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
>>+		 */
>>+		u64 gt_stamp;
>>+
>>+		/**
>>+		 * @ping_delay: Period for polling the GT timestamp for
>>+		 * overflow.
>>+		 */
>>+		unsigned long ping_delay;
>>+
>>+		/**
>>+		 * @work: Periodic work to adjust GT timestamp, engine and
>>+		 * context usage for overflows.
>>+		 */
>>+		struct delayed_work work;
>>+	} timestamp;
>>  };
>>  static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>index 2c6ea64af7ec..ca9ab53999d5 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>@@ -26,6 +26,8 @@
>>   *      | guc_policies                          |
>>   *      +---------------------------------------+
>>   *      | guc_gt_system_info                    |
>>+ *      +---------------------------------------+
>>+ *      | guc_engine_usage                      |
>>   *      +---------------------------------------+ <== static
>>   *      | guc_mmio_reg[countA] (engine 0.0)     |
>>   *      | guc_mmio_reg[countB] (engine 0.1)     |
>>@@ -47,6 +49,7 @@ struct __guc_ads_blob {
>>  	struct guc_ads ads;
>>  	struct guc_policies policies;
>>  	struct guc_gt_system_info system_info;
>>+	struct guc_engine_usage engine_usage;
>>  	/* From here on, location is dynamic! Refer to above diagram. */
>>  	struct guc_mmio_reg regset[0];
>>  } __packed;
>>@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>>  	guc_ads_private_data_reset(guc);
>>  }
>>+
>>+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
>>+{
>>+	struct __guc_ads_blob *blob = guc->ads_blob;
>>+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
>>+	u32 offset = base + ptr_offset(blob, engine_usage);
>>+
>>+	return offset;
>>+}
>>+
>>+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
>>+{
>>+	struct intel_guc *guc = &engine->gt->uc.guc;
>>+	struct __guc_ads_blob *blob = guc->ads_blob;
>>+	u8 guc_class = engine_class_to_guc_class(engine->class);
>>+
>>+	return &blob->engine_usage.engines[guc_class][engine->instance];
>>+}
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>index 3d85051d57e4..e74c110facff 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>@@ -6,8 +6,11 @@
>>  #ifndef _INTEL_GUC_ADS_H_
>>  #define _INTEL_GUC_ADS_H_
>>+#include <linux/types.h>
>>+
>>  struct intel_guc;
>>  struct drm_printer;
>>+struct intel_engine_cs;
>>  int intel_guc_ads_create(struct intel_guc *guc);
>>  void intel_guc_ads_destroy(struct intel_guc *guc);
>>@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
>>  void intel_guc_ads_reset(struct intel_guc *guc);
>>  void intel_guc_ads_print_policy_info(struct intel_guc *guc,
>>  				     struct drm_printer *p);
>>+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
>>+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
>>  #endif
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>index fa4be13c8854..7c9c081670fc 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>@@ -294,6 +294,19 @@ struct guc_ads {
>>  	u32 reserved[15];
>>  } __packed;
>>+/* Engine usage stats */
>>+struct guc_engine_usage_record {
>>+	u32 current_context_index;
>>+	u32 last_switch_in_stamp;
>>+	u32 reserved0;
>>+	u32 total_runtime;
>>+	u32 reserved1[4];
>>+} __packed;
>>+
>>+struct guc_engine_usage {
>>+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
>>+} __packed;
>>+
>>  /* GuC logging structures */
>>  enum guc_log_buffer_type {
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>index ba0de35f6323..0c2e4d8d8ec3 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>@@ -12,6 +12,7 @@
>>  #include "gt/intel_engine_pm.h"
>>  #include "gt/intel_engine_heartbeat.h"
>>  #include "gt/intel_gt.h"
>>+#include "gt/intel_gt_clock_utils.h"
>>  #include "gt/intel_gt_irq.h"
>>  #include "gt/intel_gt_pm.h"
>>  #include "gt/intel_gt_requests.h"
>>@@ -20,6 +21,7 @@
>>  #include "gt/intel_mocs.h"
>>  #include "gt/intel_ring.h"
>>+#include "intel_guc_ads.h"
>>  #include "intel_guc_submission.h"
>>  #include "i915_drv.h"
>>@@ -750,6 +752,262 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>>  	xa_unlock_irqrestore(&guc->context_lookup, flags);
>>  }
>>+/*
>>+ * GuC stores busyness stats for each engine at context in/out boundaries. A
>>+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
>>+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
>>+ * GuC.
>>+ *
>>+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
>>+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
>>+ * active. For an active engine total busyness = total + (now - start), where
>>+ * 'now' is the time at which the busyness is sampled. For inactive engine,
>>+ * total busyness = total.
>>+ *
>>+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
>>+ *
>>+ * The start and total values provided by GuC are 32 bits and wrap around in a
>>+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
>>+ * increasing ns values, there is a need for this implementation to account for
>>+ * overflows and extend the GuC provided values to 64 bits before returning
>>+ * busyness to the user. In order to do that, a worker runs periodically at
>>+ * frequency = 1/8th the time it takes for the timestamp to wrap (i.e. once in
>>+ * 27 seconds for a gt clock frequency of 19.2 MHz).
>>+ */
>>+
>>+#define WRAP_TIME_CLKS U32_MAX
>>+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
>>+
>>+static void
>>+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
>>+{
>>+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
>>+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
>>+
>>+	if (new_start == lower_32_bits(*prev_start))
>>+		return;
>>+
>>+	if (new_start < gt_stamp_last &&
>>+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
>>+		gt_stamp_hi++;
>>+
>>+	if (new_start > gt_stamp_last &&
>>+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
>>+		gt_stamp_hi--;
>>+
>>+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
>>+}
>>+
>>+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
>>+{
>>+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
>>+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
>>+	struct intel_guc *guc = &engine->gt->uc.guc;
>>+	u32 last_switch = rec->last_switch_in_stamp;
>>+	u32 ctx_id = rec->current_context_index;
>>+	u32 total = rec->total_runtime;
>>+
>>+	lockdep_assert_held(&guc->timestamp.lock);
>>+
>>+	stats->running = ctx_id != ~0U && last_switch;
>>+	if (stats->running)
>>+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
>>+
>>+	/*
>>+	 * Instead of adjusting the total for overflow, just add the
>>+	 * difference from previous sample stats->total_gt_clks
>>+	 */
>>+	if (total && total != ~0U) {
>>+		stats->total_gt_clks += (u32)(total - stats->prev_total);
>>+		stats->prev_total = total;
>>+	}
>>+}
>>+
>>+static void guc_update_pm_timestamp(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	u32 gt_stamp_now, gt_stamp_hi;
>>+
>>+	lockdep_assert_held(&guc->timestamp.lock);
>>+
>>+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
>>+	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>+
>>+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
>>+		gt_stamp_hi++;
>>+
>>+	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
>>+}
>>+
>>+/*
>>+ * Unlike the execlist mode of submission total and active times are in terms of
>>+ * gt clocks. The *now parameter is retained to return the cpu time at which the
>>+ * busyness was sampled.
>>+ */
>>+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
>>+{
>>+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
>>+	struct intel_gt *gt = engine->gt;
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+	unsigned long flags;
>>+	bool reset_in_progress;
>>+	u64 total;
>>+	int srcu;
>>+
>>+	/*
>>+	 * If a reset is in progress, we risk reading partially updated
>>+	 * engine busyness from GuC, so we just use the driver stored
>>+	 * copy of busyness. Synchronize with gt reset lock to achieve
>>+	 * this.
>>+	 */
>>+	reset_in_progress = intel_gt_reset_trylock_no_wait(gt, &srcu);
>>+
>>+	/*
>>+	 * The order of taking the reset lock first and then the
>>+	 * timestamp lock is intentional to avoid lock inversion related
>>+	 * issues.
>>+	 */
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	*now = ktime_get();
>>+
>>+	/*
>>+	 * The active busyness depends on start_gt_clk and gt_stamp.
>>+	 * gt_stamp is updated by i915 only when gt is awake and the
>>+	 * start_gt_clk is derived from GuC state. To get a consistent
>>+	 * view of activity, we query the GuC state only if gt is awake.
>>+	 */
>>+	if (intel_gt_pm_get_if_awake(gt) && !reset_in_progress) {
>>+		guc_update_engine_gt_clks(engine);
>>+		guc_update_pm_timestamp(guc);
>>+		intel_gt_pm_put_async(gt);
>>+	}
>>+
>>+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
>>+	if (stats->running) {
>>+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
>>+
>>+		total += intel_gt_clock_interval_to_ns(gt, clk);
>>+	}
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+	if (!reset_in_progress)
>>+		intel_gt_reset_unlock(gt, srcu);
>>+
>>+	return ns_to_ktime(total);
>>+}
>>+
>>+static void __reset_guc_busyness_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	struct intel_engine_cs *engine;
>>+	enum intel_engine_id id;
>>+	unsigned long flags;
>>+
>>+	cancel_delayed_work_sync(&guc->timestamp.work);
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	guc_update_pm_timestamp(guc);
>>+	for_each_engine(engine, gt, id) {
>>+		guc_update_engine_gt_clks(engine);
>>+		engine->stats.guc.prev_total = 0;
>>+	}
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+}
>>+
>>+static void __update_guc_busyness_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	struct intel_engine_cs *engine;
>>+	enum intel_engine_id id;
>>+
>>+	guc_update_pm_timestamp(guc);
>>+	for_each_engine(engine, gt, id)
>>+		guc_update_engine_gt_clks(engine);
>>+}
>>+
>>+static void guc_timestamp_ping(struct work_struct *wrk)
>>+{
>>+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>+					     timestamp.work.work);
>>+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	intel_wakeref_t wakeref;
>>+	unsigned long flags;
>>+	int srcu, ret;
>>+
>>+	/*
>>+	 * Synchronize with gt reset to make sure the worker does not
>>+	 * corrupt the engine/guc stats.
>>+	 */
>>+	ret = intel_gt_reset_trylock(gt, &srcu);
>>+	if (ret)
>>+		return;
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>+		__update_guc_busyness_stats(guc);
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+
>>+	intel_gt_reset_unlock(gt, srcu);
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>>+}
>>+
>>+static int guc_action_enable_usage_stats(struct intel_guc *guc)
>>+{
>>+	u32 offset = intel_guc_engine_usage_offset(guc);
>>+	u32 action[] = {
>>+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
>>+		offset,
>>+		0,
>>+	};
>>+
>>+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
>>+}
>>+
>>+static void guc_init_engine_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	intel_wakeref_t wakeref;
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>>+
>>+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
>>+		int ret = guc_action_enable_usage_stats(guc);
>>+
>>+		if (ret)
>>+			drm_err(&gt->i915->drm,
>>+				"Failed to enable usage stats: %d!\n", ret);
>>+	}
>>+}
>>+
>>+void intel_guc_busyness_park(struct intel_gt *gt)
>>+{
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+	unsigned long flags;
>>+
>>+	cancel_delayed_work(&guc->timestamp.work);
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+	__update_guc_busyness_stats(guc);
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+}
>>+
>>+void intel_guc_busyness_unpark(struct intel_gt *gt)
>>+{
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>>+}
>>+
>>  static inline bool
>>  submission_disabled(struct intel_guc *guc)
>>  {
>>@@ -809,6 +1067,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
>>  	intel_gt_park_heartbeats(guc_to_gt(guc));
>>  	disable_submission(guc);
>>  	guc->interrupts.disable(guc);
>>+	__reset_guc_busyness_stats(guc);
>>  	/* Flush IRQ handler */
>>  	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
>>@@ -1132,6 +1391,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
>>   */
>>  int intel_guc_submission_init(struct intel_guc *guc)
>>  {
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>  	int ret;
>>  	if (guc->lrc_desc_pool)
>>@@ -1152,6 +1412,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
>>  	INIT_LIST_HEAD(&guc->guc_id_list);
>>  	ida_init(&guc->guc_ids);
>>+	spin_lock_init(&guc->timestamp.lock);
>>+	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
>>+	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
>>+
>>  	return 0;
>>  }
>>@@ -2606,7 +2870,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>>  		engine->emit_flush = gen12_emit_flush_xcs;
>>  	}
>>  	engine->set_default_submission = guc_set_default_submission;
>>+	engine->busyness = guc_engine_busyness;
>>+	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>>  	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>>  	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
>>@@ -2705,6 +2971,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
>>  void intel_guc_submission_enable(struct intel_guc *guc)
>>  {
>>  	guc_init_lrc_mapping(guc);
>>+	guc_init_engine_stats(guc);
>>  }
>>  void intel_guc_submission_disable(struct intel_guc *guc)
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>index c7ef44fa0c36..5a95a9f0a8e3 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>@@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>>  void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
>>  				    struct i915_request *hung_rq,
>>  				    struct drm_printer *m);
>>+void intel_guc_busyness_park(struct intel_gt *gt);
>>+void intel_guc_busyness_unpark(struct intel_gt *gt);
>>  bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>>diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>>index a897f4abea0c..9aee08425382 100644
>>--- a/drivers/gpu/drm/i915/i915_reg.h
>>+++ b/drivers/gpu/drm/i915/i915_reg.h
>>@@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>  #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
>>  #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
>>+#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
>>+
>>  /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
>>  #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
>>  #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
>>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-13  0:56 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
@ 2021-10-13 16:06   ` Tvrtko Ursulin
  2021-10-13 16:27     ` Umesh Nerlige Ramappa
  2021-10-14  8:21   ` Tvrtko Ursulin
  1 sibling, 1 reply; 31+ messages in thread
From: Tvrtko Ursulin @ 2021-10-13 16:06 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, dri-devel
  Cc: john.c.harrison, daniel.vetter, Matthew Brost


On 13/10/2021 01:56, Umesh Nerlige Ramappa wrote:
> With GuC handling scheduling, i915 is not aware of the time that a
> context is scheduled in and out of the engine. Since i915 pmu relies on
> this info to provide engine busyness to the user, GuC shares this info
> with i915 for all engines using shared memory. For each engine, this
> info contains:
> 
> - total busyness: total time that the context was running (total)
> - id: id of the running context (id)
> - start timestamp: timestamp when the context started running (start)
> 
> At the time (now) of sampling the engine busyness, if the id is valid
> (!= ~0), and start is non-zero, then the context is considered to be
> active and the engine busyness is calculated using the below equation
> 
> 	engine busyness = total + (now - start)
> 
> All times are obtained from the gt clock base. For inactive contexts,
> engine busyness is just equal to the total.
> 
> The start and total values provided by GuC are 32 bits and wrap around
> in a few minutes. Since perf pmu provides busyness as 64 bit
> monotonically increasing values, there is a need for this implementation
> to account for overflows and extend the time to 64 bits before returning
> busyness to the user. In order to do that, a worker runs periodically at
> frequency = 1/8th the time it takes for the timestamp to wrap. As an
> example, that would be once in 27 seconds for a gt clock frequency of
> 19.2 MHz.
> 
> Note:
> There might be an overaccounting of busyness due to the fact that GuC
> may be updating the total and start values while kmd is reading them.
> (i.e kmd may read the updated total and the stale start). In such a
> case, user may see higher busyness value followed by smaller ones which
> would eventually catch up to the higher value.
> 
> v2: (Tvrtko)
> - Include details in commit message
> - Move intel engine busyness function into execlist code
> - Use union inside engine->stats
> - Use natural type for ping delay jiffies
> - Drop active_work condition checks
> - Use for_each_engine if iterating all engines
> - Drop seq locking, use spinlock at guc level to update engine stats
> - Document worker specific details
> 
> v3: (Tvrtko/Umesh)
> - Demarcate guc and execlist stat objects with comments
> - Document known over-accounting issue in commit
> - Provide a consistent view of guc state
> - Add hooks to gt park/unpark for guc busyness
> - Stop/start worker in gt park/unpark path
> - Drop inline
> - Move spinlock and worker inits to guc initialization
> - Drop helpers that are called only once
> 
> v4: (Tvrtko/Matt/Umesh)
> - Drop addressed opens from commit message
> - Get runtime pm in ping, remove from the park path
> - Use cancel_delayed_work_sync in disable_submission path
> - Update stats during reset prepare
> - Skip ping if reset in progress
> - Explicitly name execlists and guc stats objects
> - Since disable_submission is called from many places, move resetting
>    stats to intel_guc_submission_reset_prepare
> 
> v5: (Tvrtko)
> - Add a trylock helper that does not sleep and synchronize PMU event
>    callbacks and worker with gt reset

Looks good to me now, for some combination of high level and incomplete
low level review (I did not check the overflow handling or the GuC page
layout and flow). Both patches:

Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Do you have someone available to check the parts I did not and r-b?

Regards,

Tvrtko

> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
>   drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
>   .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
>   drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
>   drivers/gpu/drm/i915/gt/intel_reset.c         |  16 ++
>   drivers/gpu/drm/i915/gt/intel_reset.h         |   1 +
>   .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++++++++++++++++++
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
>   drivers/gpu/drm/i915/i915_reg.h               |   2 +
>   14 files changed, 427 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 38436f4b5706..6b783fdcba2a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   	intel_engine_print_breadcrumbs(engine, m);
>   }
>   
> -static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
> -					    ktime_t *now)
> -{
> -	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> -	ktime_t total = stats->total;
> -
> -	/*
> -	 * If the engine is executing something at the moment
> -	 * add it to the total.
> -	 */
> -	*now = ktime_get();
> -	if (READ_ONCE(stats->active))
> -		total = ktime_add(total, ktime_sub(*now, stats->start));
> -
> -	return total;
> -}
> -
>   /**
>    * intel_engine_get_busy_time() - Return current accumulated engine busyness
>    * @engine: engine to report on
> @@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>    */
>   ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
>   {
> -	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> -	unsigned int seq;
> -	ktime_t total;
> -
> -	do {
> -		seq = read_seqcount_begin(&stats->lock);
> -		total = __intel_engine_get_busy_time(engine, now);
> -	} while (read_seqcount_retry(&stats->lock, seq));
> -
> -	return total;
> +	return engine->busyness(engine, now);
>   }
>   
>   struct intel_context *
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index b820a2c1124e..9300c65d6675 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
>   	ktime_t start;
>   };
>   
> +struct intel_engine_guc_stats {
> +	/**
> +	 * @running: Active state of the engine when busyness was last sampled.
> +	 */
> +	bool running;
> +
> +	/**
> +	 * @prev_total: Previous value of total runtime clock cycles.
> +	 */
> +	u32 prev_total;
> +
> +	/**
> +	 * @total_gt_clks: Total gt clock cycles this engine was busy.
> +	 */
> +	u64 total_gt_clks;
> +
> +	/**
> +	 * @start_gt_clk: GT clock time of last idle to active transition.
> +	 */
> +	u64 start_gt_clk;
> +};
> +
>   struct intel_engine_cs {
>   	struct drm_i915_private *i915;
>   	struct intel_gt *gt;
> @@ -459,6 +481,12 @@ struct intel_engine_cs {
>   	void		(*add_active_request)(struct i915_request *rq);
>   	void		(*remove_active_request)(struct i915_request *rq);
>   
> +	/*
> +	 * Get engine busyness and the time at which the busyness was sampled.
> +	 */
> +	ktime_t		(*busyness)(struct intel_engine_cs *engine,
> +				    ktime_t *now);
> +
>   	struct intel_engine_execlists execlists;
>   
>   	/*
> @@ -508,7 +536,10 @@ struct intel_engine_cs {
>   	u32 (*get_cmd_length_mask)(u32 cmd_header);
>   
>   	struct {
> -		struct intel_engine_execlists_stats execlists;
> +		union {
> +			struct intel_engine_execlists_stats execlists;
> +			struct intel_engine_guc_stats guc;
> +		};
>   
>   		/**
>   		 * @rps: Utilisation at last RPS sampling.
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 73a79c2acd3a..e8ffcf36f6f4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
>   	lrc_fini_wa_ctx(engine);
>   }
>   
> +static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
> +					   ktime_t *now)
> +{
> +	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> +	ktime_t total = stats->total;
> +
> +	/*
> +	 * If the engine is executing something at the moment
> +	 * add it to the total.
> +	 */
> +	*now = ktime_get();
> +	if (READ_ONCE(stats->active))
> +		total = ktime_add(total, ktime_sub(*now, stats->start));
> +
> +	return total;
> +}
> +
> +static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
> +					 ktime_t *now)
> +{
> +	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> +	unsigned int seq;
> +	ktime_t total;
> +
> +	do {
> +		seq = read_seqcount_begin(&stats->lock);
> +		total = __execlists_engine_busyness(engine, now);
> +	} while (read_seqcount_retry(&stats->lock, seq));
> +
> +	return total;
> +}
> +
>   static void
>   logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   {
> @@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   		engine->emit_bb_start = gen8_emit_bb_start;
>   	else
>   		engine->emit_bb_start = gen8_emit_bb_start_noarb;
> +
> +	engine->busyness = execlists_engine_busyness;
>   }
>   
>   static void logical_ring_default_irqs(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 524eaf678790..b4a8594bc46c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
>   	intel_rc6_unpark(&gt->rc6);
>   	intel_rps_unpark(&gt->rps);
>   	i915_pmu_gt_unparked(i915);
> +	intel_guc_busyness_unpark(gt);
>   
>   	intel_gt_unpark_requests(gt);
>   	runtime_begin(gt);
> @@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
>   	runtime_end(gt);
>   	intel_gt_park_requests(gt);
>   
> +	intel_guc_busyness_park(gt);
>   	i915_vma_parked(gt);
>   	i915_pmu_gt_parked(i915);
>   	intel_rps_park(&gt->rps);
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 91200c43951f..ac12163c3639 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1389,6 +1389,22 @@ void intel_gt_handle_error(struct intel_gt *gt,
>   	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
>   }
>   
> +bool intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu)
> +{
> +	int reset_in_progress;
> +
> +	might_lock(&gt->reset.backoff_srcu);
> +	cant_sleep();
> +
> +	rcu_read_lock();
> +	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
> +	if (!reset_in_progress)
> +		*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
> +	rcu_read_unlock();
> +
> +	return reset_in_progress;
> +}
> +
>   int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
>   {
>   	might_lock(&gt->reset.backoff_srcu);
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
> index adc734e67387..4f5f4c00c54f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.h
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.h
> @@ -38,6 +38,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine,
>   
>   void __i915_request_reset(struct i915_request *rq, bool guilty);
>   
> +bool __must_check intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu);
>   int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
>   void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> index 8ff582222aff..ff1311d4beff 100644
> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> @@ -143,6 +143,7 @@ enum intel_guc_action {
>   	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>   	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>   	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
> +	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>   	INTEL_GUC_ACTION_LIMIT
>   };
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index 5dd174babf7a..3c3d48c7d5de 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -104,6 +104,8 @@ struct intel_guc {
>   	u32 ads_regset_size;
>   	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
>   	u32 ads_golden_ctxt_size;
> +	/** @ads_engine_usage_size: size of engine usage in the ADS */
> +	u32 ads_engine_usage_size;
>   
>   	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
>   	struct i915_vma *lrc_desc_pool;
> @@ -138,6 +140,34 @@ struct intel_guc {
>   
>   	/** @send_mutex: used to serialize the intel_guc_send actions */
>   	struct mutex send_mutex;
> +
> +	/**
> +	 * @timestamp: GT timestamp object that stores a copy of the timestamp
> +	 * and adjusts it for overflow using a worker.
> +	 */
> +	struct {
> +		/**
> +		 * @lock: Lock protecting the below fields and the engine stats.
> +		 */
> +		spinlock_t lock;
> +
> +		/**
> +		 * @gt_stamp: 64 bit extended value of the GT timestamp.
> +		 */
> +		u64 gt_stamp;
> +
> +		/**
> +		 * @ping_delay: Period for polling the GT timestamp for
> +		 * overflow.
> +		 */
> +		unsigned long ping_delay;
> +
> +		/**
> +		 * @work: Periodic work to adjust GT timestamp, engine and
> +		 * context usage for overflows.
> +		 */
> +		struct delayed_work work;
> +	} timestamp;
>   };
>   
>   static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> index 2c6ea64af7ec..ca9ab53999d5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> @@ -26,6 +26,8 @@
>    *      | guc_policies                          |
>    *      +---------------------------------------+
>    *      | guc_gt_system_info                    |
> + *      +---------------------------------------+
> + *      | guc_engine_usage                      |
>    *      +---------------------------------------+ <== static
>    *      | guc_mmio_reg[countA] (engine 0.0)     |
>    *      | guc_mmio_reg[countB] (engine 0.1)     |
> @@ -47,6 +49,7 @@ struct __guc_ads_blob {
>   	struct guc_ads ads;
>   	struct guc_policies policies;
>   	struct guc_gt_system_info system_info;
> +	struct guc_engine_usage engine_usage;
>   	/* From here on, location is dynamic! Refer to above diagram. */
>   	struct guc_mmio_reg regset[0];
>   } __packed;
> @@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>   
>   	guc_ads_private_data_reset(guc);
>   }
> +
> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
> +{
> +	struct __guc_ads_blob *blob = guc->ads_blob;
> +	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
> +	u32 offset = base + ptr_offset(blob, engine_usage);
> +
> +	return offset;
> +}
> +
> +struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
> +{
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	struct __guc_ads_blob *blob = guc->ads_blob;
> +	u8 guc_class = engine_class_to_guc_class(engine->class);
> +
> +	return &blob->engine_usage.engines[guc_class][engine->instance];
> +}
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> index 3d85051d57e4..e74c110facff 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> @@ -6,8 +6,11 @@
>   #ifndef _INTEL_GUC_ADS_H_
>   #define _INTEL_GUC_ADS_H_
>   
> +#include <linux/types.h>
> +
>   struct intel_guc;
>   struct drm_printer;
> +struct intel_engine_cs;
>   
>   int intel_guc_ads_create(struct intel_guc *guc);
>   void intel_guc_ads_destroy(struct intel_guc *guc);
> @@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
>   void intel_guc_ads_reset(struct intel_guc *guc);
>   void intel_guc_ads_print_policy_info(struct intel_guc *guc,
>   				     struct drm_printer *p);
> +struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
>   
>   #endif
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index fa4be13c8854..7c9c081670fc 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -294,6 +294,19 @@ struct guc_ads {
>   	u32 reserved[15];
>   } __packed;
>   
> +/* Engine usage stats */
> +struct guc_engine_usage_record {
> +	u32 current_context_index;
> +	u32 last_switch_in_stamp;
> +	u32 reserved0;
> +	u32 total_runtime;
> +	u32 reserved1[4];
> +} __packed;
> +
> +struct guc_engine_usage {
> +	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
> +} __packed;
> +
>   /* GuC logging structures */
>   
>   enum guc_log_buffer_type {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index ba0de35f6323..0c2e4d8d8ec3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -12,6 +12,7 @@
>   #include "gt/intel_engine_pm.h"
>   #include "gt/intel_engine_heartbeat.h"
>   #include "gt/intel_gt.h"
> +#include "gt/intel_gt_clock_utils.h"
>   #include "gt/intel_gt_irq.h"
>   #include "gt/intel_gt_pm.h"
>   #include "gt/intel_gt_requests.h"
> @@ -20,6 +21,7 @@
>   #include "gt/intel_mocs.h"
>   #include "gt/intel_ring.h"
>   
> +#include "intel_guc_ads.h"
>   #include "intel_guc_submission.h"
>   
>   #include "i915_drv.h"
> @@ -750,6 +752,262 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>   	xa_unlock_irqrestore(&guc->context_lookup, flags);
>   }
>   
> +/*
> + * GuC stores busyness stats for each engine at context in/out boundaries. A
> + * context 'in' logs execution start time, 'out' adds in -> out delta to total.
> + * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
> + * GuC.
> + *
> + * __i915_pmu_event_read samples engine busyness. When sampling, if context id
> + * is valid (!= ~0) and start is non-zero, the engine is considered to be
> + * active. For an active engine total busyness = total + (now - start), where
> + * 'now' is the time at which the busyness is sampled. For inactive engine,
> + * total busyness = total.
> + *
> + * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
> + *
> + * The start and total values provided by GuC are 32 bits and wrap around in a
> + * few minutes. Since perf pmu provides busyness as 64 bit monotonically
> + * increasing ns values, there is a need for this implementation to account for
> + * overflows and extend the GuC provided values to 64 bits before returning
> + * busyness to the user. In order to do that, a worker runs periodically at
> + * frequency = 1/8th the time it takes for the timestamp to wrap (i.e. once in
> + * 27 seconds for a gt clock frequency of 19.2 MHz).
> + */
> +
> +#define WRAP_TIME_CLKS U32_MAX
> +#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
> +
> +static void
> +__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
> +{
> +	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
> +	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
> +
> +	if (new_start == lower_32_bits(*prev_start))
> +		return;
> +
> +	if (new_start < gt_stamp_last &&
> +	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
> +		gt_stamp_hi++;
> +
> +	if (new_start > gt_stamp_last &&
> +	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
> +		gt_stamp_hi--;
> +
> +	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
> +}
> +
> +static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
> +{
> +	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
> +	struct intel_engine_guc_stats *stats = &engine->stats.guc;
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	u32 last_switch = rec->last_switch_in_stamp;
> +	u32 ctx_id = rec->current_context_index;
> +	u32 total = rec->total_runtime;
> +
> +	lockdep_assert_held(&guc->timestamp.lock);
> +
> +	stats->running = ctx_id != ~0U && last_switch;
> +	if (stats->running)
> +		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
> +
> +	/*
> +	 * Instead of adjusting the total for overflow, just add the
> +	 * difference from previous sample stats->total_gt_clks
> +	 */
> +	if (total && total != ~0U) {
> +		stats->total_gt_clks += (u32)(total - stats->prev_total);
> +		stats->prev_total = total;
> +	}
> +}
> +
> +static void guc_update_pm_timestamp(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	u32 gt_stamp_now, gt_stamp_hi;
> +
> +	lockdep_assert_held(&guc->timestamp.lock);
> +
> +	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
> +	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
> +
> +	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
> +		gt_stamp_hi++;
> +
> +	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
> +}
> +
> +/*
> + * Unlike the execlist mode of submission total and active times are in terms of
> + * gt clocks. The *now parameter is retained to return the cpu time at which the
> + * busyness was sampled.
> + */
> +static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
> +{
> +	struct intel_engine_guc_stats *stats = &engine->stats.guc;
> +	struct intel_gt *gt = engine->gt;
> +	struct intel_guc *guc = &gt->uc.guc;
> +	unsigned long flags;
> +	bool reset_in_progress;
> +	u64 total;
> +	int srcu;
> +
> +	/*
> +	 * If a reset is in progress, we risk reading partially updated
> +	 * engine busyness from GuC, so we just use the driver stored
> +	 * copy of busyness. Synchronize with gt reset lock to achieve
> +	 * this.
> +	 */
> +	reset_in_progress = intel_gt_reset_trylock_no_wait(gt, &srcu);
> +
> +	/*
> +	 * The order of taking the reset lock first and then the
> +	 * timestamp lock is intentional to avoid lock inversion related
> +	 * issues.
> +	 */
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	*now = ktime_get();
> +
> +	/*
> +	 * The active busyness depends on start_gt_clk and gt_stamp.
> +	 * gt_stamp is updated by i915 only when gt is awake and the
> +	 * start_gt_clk is derived from GuC state. To get a consistent
> +	 * view of activity, we query the GuC state only if gt is awake.
> +	 */
> +	if (intel_gt_pm_get_if_awake(gt) && !reset_in_progress) {
> +		guc_update_engine_gt_clks(engine);
> +		guc_update_pm_timestamp(guc);
> +		intel_gt_pm_put_async(gt);
> +	}
> +
> +	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
> +	if (stats->running) {
> +		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
> +
> +		total += intel_gt_clock_interval_to_ns(gt, clk);
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +	if (!reset_in_progress)
> +		intel_gt_reset_unlock(gt, srcu);
> +
> +	return ns_to_ktime(total);
> +}
> +
> +static void __reset_guc_busyness_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	unsigned long flags;
> +
> +	cancel_delayed_work_sync(&guc->timestamp.work);
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	guc_update_pm_timestamp(guc);
> +	for_each_engine(engine, gt, id) {
> +		guc_update_engine_gt_clks(engine);
> +		engine->stats.guc.prev_total = 0;
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void __update_guc_busyness_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	guc_update_pm_timestamp(guc);
> +	for_each_engine(engine, gt, id)
> +		guc_update_engine_gt_clks(engine);
> +}
> +
> +static void guc_timestamp_ping(struct work_struct *wrk)
> +{
> +	struct intel_guc *guc = container_of(wrk, typeof(*guc),
> +					     timestamp.work.work);
> +	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +	unsigned long flags;
> +	int srcu, ret;
> +
> +	/*
> +	 * Synchronize with gt reset to make sure the worker does not
> +	 * corrupt the engine/guc stats.
> +	 */
> +	ret = intel_gt_reset_trylock(gt, &srcu);
> +	if (ret)
> +		return;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
> +		__update_guc_busyness_stats(guc);
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +
> +	intel_gt_reset_unlock(gt, srcu);
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +}
> +
> +static int guc_action_enable_usage_stats(struct intel_guc *guc)
> +{
> +	u32 offset = intel_guc_engine_usage_offset(guc);
> +	u32 action[] = {
> +		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
> +		offset,
> +		0,
> +	};
> +
> +	return intel_guc_send(guc, action, ARRAY_SIZE(action));
> +}
> +
> +static void guc_init_engine_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
> +		int ret = guc_action_enable_usage_stats(guc);
> +
> +		if (ret)
> +			drm_err(&gt->i915->drm,
> +				"Failed to enable usage stats: %d!\n", ret);
> +	}
> +}
> +
> +void intel_guc_busyness_park(struct intel_gt *gt)
> +{
> +	struct intel_guc *guc = &gt->uc.guc;
> +	unsigned long flags;
> +
> +	cancel_delayed_work(&guc->timestamp.work);
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +	__update_guc_busyness_stats(guc);
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +void intel_guc_busyness_unpark(struct intel_gt *gt)
> +{
> +	struct intel_guc *guc = &gt->uc.guc;
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +}
> +
>   static inline bool
>   submission_disabled(struct intel_guc *guc)
>   {
> @@ -809,6 +1067,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
>   	intel_gt_park_heartbeats(guc_to_gt(guc));
>   	disable_submission(guc);
>   	guc->interrupts.disable(guc);
> +	__reset_guc_busyness_stats(guc);
>   
>   	/* Flush IRQ handler */
>   	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
> @@ -1132,6 +1391,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
>    */
>   int intel_guc_submission_init(struct intel_guc *guc)
>   {
> +	struct intel_gt *gt = guc_to_gt(guc);
>   	int ret;
>   
>   	if (guc->lrc_desc_pool)
> @@ -1152,6 +1412,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
>   	INIT_LIST_HEAD(&guc->guc_id_list);
>   	ida_init(&guc->guc_ids);
>   
> +	spin_lock_init(&guc->timestamp.lock);
> +	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
> +	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
> +
>   	return 0;
>   }
>   
> @@ -2606,7 +2870,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>   		engine->emit_flush = gen12_emit_flush_xcs;
>   	}
>   	engine->set_default_submission = guc_set_default_submission;
> +	engine->busyness = guc_engine_busyness;
>   
> +	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>   	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>   	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
>   
> @@ -2705,6 +2971,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
>   void intel_guc_submission_enable(struct intel_guc *guc)
>   {
>   	guc_init_lrc_mapping(guc);
> +	guc_init_engine_stats(guc);
>   }
>   
>   void intel_guc_submission_disable(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index c7ef44fa0c36..5a95a9f0a8e3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>   void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
>   				    struct i915_request *hung_rq,
>   				    struct drm_printer *m);
> +void intel_guc_busyness_park(struct intel_gt *gt);
> +void intel_guc_busyness_unpark(struct intel_gt *gt);
>   
>   bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>   
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index a897f4abea0c..9aee08425382 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>   #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
>   #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
>   
> +#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
> +
>   /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
>   #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
>   #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-13  0:56 [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats Umesh Nerlige Ramappa
@ 2021-10-13  0:56 ` Umesh Nerlige Ramappa
  2021-10-13 16:06   ` Tvrtko Ursulin
  2021-10-14  8:21   ` Tvrtko Ursulin
  0 siblings, 2 replies; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-13  0:56 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: john.c.harrison, Tvrtko Ursulin, daniel.vetter, Matthew Brost

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

	engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically with
a period of 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once every 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e. kmd may read the updated total and the stale start). In such a
case, the user may see a higher busyness value followed by smaller ones
which would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c         |  16 ++
 drivers/gpu/drm/i915/gt/intel_reset.h         |   1 +
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  30 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++++++++++++++++++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h               |   2 +
 14 files changed, 427 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	intel_engine_print_breadcrumbs(engine, m);
 }
 
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-					    ktime_t *now)
-{
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	ktime_t total = stats->total;
-
-	/*
-	 * If the engine is executing something at the moment
-	 * add it to the total.
-	 */
-	*now = ktime_get();
-	if (READ_ONCE(stats->active))
-		total = ktime_add(total, ktime_sub(*now, stats->start));
-
-	return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine busyness
  * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
 {
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	unsigned int seq;
-	ktime_t total;
-
-	do {
-		seq = read_seqcount_begin(&stats->lock);
-		total = __intel_engine_get_busy_time(engine, now);
-	} while (read_seqcount_retry(&stats->lock, seq));
-
-	return total;
+	return engine->busyness(engine, now);
 }
 
 struct intel_context *
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index b820a2c1124e..9300c65d6675 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
 	ktime_t start;
 };
 
+struct intel_engine_guc_stats {
+	/**
+	 * @running: Active state of the engine when busyness was last sampled.
+	 */
+	bool running;
+
+	/**
+	 * @prev_total: Previous value of total runtime clock cycles.
+	 */
+	u32 prev_total;
+
+	/**
+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
+	 */
+	u64 total_gt_clks;
+
+	/**
+	 * @start_gt_clk: GT clock time of last idle to active transition.
+	 */
+	u64 start_gt_clk;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -459,6 +481,12 @@ struct intel_engine_cs {
 	void		(*add_active_request)(struct i915_request *rq);
 	void		(*remove_active_request)(struct i915_request *rq);
 
+	/*
+	 * Get engine busyness and the time at which the busyness was sampled.
+	 */
+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
+				    ktime_t *now);
+
 	struct intel_engine_execlists execlists;
 
 	/*
@@ -508,7 +536,10 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 
 	struct {
-		struct intel_engine_execlists_stats execlists;
+		union {
+			struct intel_engine_execlists_stats execlists;
+			struct intel_engine_guc_stats guc;
+		};
 
 		/**
 		 * @rps: Utilisation at last RPS sampling.
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 73a79c2acd3a..e8ffcf36f6f4 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
 	lrc_fini_wa_ctx(engine);
 }
 
+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
+					   ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	ktime_t total = stats->total;
+
+	/*
+	 * If the engine is executing something at the moment
+	 * add it to the total.
+	 */
+	*now = ktime_get();
+	if (READ_ONCE(stats->active))
+		total = ktime_add(total, ktime_sub(*now, stats->start));
+
+	return total;
+}
+
+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
+					 ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	unsigned int seq;
+	ktime_t total;
+
+	do {
+		seq = read_seqcount_begin(&stats->lock);
+		total = __execlists_engine_busyness(engine, now);
+	} while (read_seqcount_retry(&stats->lock, seq));
+
+	return total;
+}
+
 static void
 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
@@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_bb_start = gen8_emit_bb_start;
 	else
 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
+
+	engine->busyness = execlists_engine_busyness;
 }
 
 static void logical_ring_default_irqs(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 524eaf678790..b4a8594bc46c 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 	intel_rc6_unpark(&gt->rc6);
 	intel_rps_unpark(&gt->rps);
 	i915_pmu_gt_unparked(i915);
+	intel_guc_busyness_unpark(gt);
 
 	intel_gt_unpark_requests(gt);
 	runtime_begin(gt);
@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
 	runtime_end(gt);
 	intel_gt_park_requests(gt);
 
+	intel_guc_busyness_park(gt);
 	i915_vma_parked(gt);
 	i915_pmu_gt_parked(i915);
 	intel_rps_park(&gt->rps);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 91200c43951f..ac12163c3639 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1389,6 +1389,22 @@ void intel_gt_handle_error(struct intel_gt *gt,
 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
 }
 
+bool intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu)
+{
+	int reset_in_progress;
+
+	might_lock(&gt->reset.backoff_srcu);
+	cant_sleep();
+
+	rcu_read_lock();
+	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
+	if (!reset_in_progress)
+		*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
+	rcu_read_unlock();
+
+	return reset_in_progress;
+}
+
 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
 {
 	might_lock(&gt->reset.backoff_srcu);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
index adc734e67387..4f5f4c00c54f 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.h
+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
@@ -38,6 +38,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine,
 
 void __i915_request_reset(struct i915_request *rq, bool guilty);
 
+bool __must_check intel_gt_reset_trylock_no_wait(struct intel_gt *gt, int *srcu);
 int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu);
 void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
 
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index 8ff582222aff..ff1311d4beff 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -143,6 +143,7 @@ enum intel_guc_action {
 	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
 	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
 	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
 	INTEL_GUC_ACTION_LIMIT
 };
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 5dd174babf7a..3c3d48c7d5de 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -104,6 +104,8 @@ struct intel_guc {
 	u32 ads_regset_size;
 	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
 	u32 ads_golden_ctxt_size;
+	/** @ads_engine_usage_size: size of engine usage in the ADS */
+	u32 ads_engine_usage_size;
 
 	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
 	struct i915_vma *lrc_desc_pool;
@@ -138,6 +140,34 @@ struct intel_guc {
 
 	/** @send_mutex: used to serialize the intel_guc_send actions */
 	struct mutex send_mutex;
+
+	/**
+	 * @timestamp: GT timestamp object that stores a copy of the timestamp
+	 * and adjusts it for overflow using a worker.
+	 */
+	struct {
+		/**
+		 * @lock: Lock protecting the below fields and the engine stats.
+		 */
+		spinlock_t lock;
+
+		/**
+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
+		 */
+		u64 gt_stamp;
+
+		/**
+		 * @ping_delay: Period for polling the GT timestamp for
+		 * overflow.
+		 */
+		unsigned long ping_delay;
+
+		/**
+		 * @work: Periodic work to adjust GT timestamp, engine and
+		 * context usage for overflows.
+		 */
+		struct delayed_work work;
+	} timestamp;
 };
 
 static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 2c6ea64af7ec..ca9ab53999d5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -26,6 +26,8 @@
  *      | guc_policies                          |
  *      +---------------------------------------+
  *      | guc_gt_system_info                    |
+ *      +---------------------------------------+
+ *      | guc_engine_usage                      |
  *      +---------------------------------------+ <== static
  *      | guc_mmio_reg[countA] (engine 0.0)     |
  *      | guc_mmio_reg[countB] (engine 0.1)     |
@@ -47,6 +49,7 @@ struct __guc_ads_blob {
 	struct guc_ads ads;
 	struct guc_policies policies;
 	struct guc_gt_system_info system_info;
+	struct guc_engine_usage engine_usage;
 	/* From here on, location is dynamic! Refer to above diagram. */
 	struct guc_mmio_reg regset[0];
 } __packed;
@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
 
 	guc_ads_private_data_reset(guc);
 }
+
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
+{
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
+	u32 offset = base + ptr_offset(blob, engine_usage);
+
+	return offset;
+}
+
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
+{
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u8 guc_class = engine_class_to_guc_class(engine->class);
+
+	return &blob->engine_usage.engines[guc_class][engine->instance];
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
index 3d85051d57e4..e74c110facff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
@@ -6,8 +6,11 @@
 #ifndef _INTEL_GUC_ADS_H_
 #define _INTEL_GUC_ADS_H_
 
+#include <linux/types.h>
+
 struct intel_guc;
 struct drm_printer;
+struct intel_engine_cs;
 
 int intel_guc_ads_create(struct intel_guc *guc);
 void intel_guc_ads_destroy(struct intel_guc *guc);
@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
 void intel_guc_ads_reset(struct intel_guc *guc);
 void intel_guc_ads_print_policy_info(struct intel_guc *guc,
 				     struct drm_printer *p);
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
 
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index fa4be13c8854..7c9c081670fc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -294,6 +294,19 @@ struct guc_ads {
 	u32 reserved[15];
 } __packed;
 
+/* Engine usage stats */
+struct guc_engine_usage_record {
+	u32 current_context_index;
+	u32 last_switch_in_stamp;
+	u32 reserved0;
+	u32 total_runtime;
+	u32 reserved1[4];
+} __packed;
+
+struct guc_engine_usage {
+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
+} __packed;
+
 /* GuC logging structures */
 
 enum guc_log_buffer_type {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index ba0de35f6323..0c2e4d8d8ec3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -12,6 +12,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_clock_utils.h"
 #include "gt/intel_gt_irq.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_requests.h"
@@ -20,6 +21,7 @@
 #include "gt/intel_mocs.h"
 #include "gt/intel_ring.h"
 
+#include "intel_guc_ads.h"
 #include "intel_guc_submission.h"
 
 #include "i915_drv.h"
@@ -750,6 +752,262 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 	xa_unlock_irqrestore(&guc->context_lookup, flags);
 }
 
+/*
+ * GuC stores busyness stats for each engine at context in/out boundaries. A
+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
+ * GuC.
+ *
+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
+ * active. For an active engine total busyness = total + (now - start), where
+ * 'now' is the time at which the busyness is sampled. For inactive engine,
+ * total busyness = total.
+ *
+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
+ *
+ * The start and total values provided by GuC are 32 bits and wrap around in a
+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
+ * increasing ns values, there is a need for this implementation to account for
+ * overflows and extend the GuC provided values to 64 bits before returning
+ * busyness to the user. In order to do that, a worker runs periodically with a
+ * period of 1/8th the time it takes for the timestamp to wrap (e.g. once every
+ * 27 seconds for a gt clock frequency of 19.2 MHz).
+ */
+
+#define WRAP_TIME_CLKS U32_MAX
+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
+
+static void
+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
+{
+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
+
+	if (new_start == lower_32_bits(*prev_start))
+		return;
+
+	if (new_start < gt_stamp_last &&
+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
+		gt_stamp_hi++;
+
+	if (new_start > gt_stamp_last &&
+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
+		gt_stamp_hi--;
+
+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
+}
+
+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
+{
+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	u32 last_switch = rec->last_switch_in_stamp;
+	u32 ctx_id = rec->current_context_index;
+	u32 total = rec->total_runtime;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	stats->running = ctx_id != ~0U && last_switch;
+	if (stats->running)
+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
+
+	/*
+	 * Instead of adjusting the total for overflow, just add the
+	 * difference from previous sample stats->total_gt_clks
+	 */
+	if (total && total != ~0U) {
+		stats->total_gt_clks += (u32)(total - stats->prev_total);
+		stats->prev_total = total;
+	}
+}
+
+static void guc_update_pm_timestamp(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	u32 gt_stamp_now, gt_stamp_hi;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+
+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
+		gt_stamp_hi++;
+
+	guc->timestamp.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_now;
+}
+
+/*
+ * Unlike the execlists mode of submission, total and active times are in terms
+ * of gt clocks. The *now parameter is retained to return the cpu time at which
+ * the busyness was sampled.
+ */
+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
+{
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_gt *gt = engine->gt;
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+	bool reset_in_progress;
+	u64 total;
+	int srcu;
+
+	/*
+	 * If a reset is in progress, we risk reading partially updated
+	 * engine busyness from GuC, so we just use the driver stored
+	 * copy of busyness. Synchronize with gt reset lock to achieve
+	 * this.
+	 */
+	reset_in_progress = intel_gt_reset_trylock_no_wait(gt, &srcu);
+
+	/*
+	 * The order of taking the reset lock first and then the
+	 * timestamp lock is intentional to avoid lock inversion related
+	 * issues.
+	 */
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	*now = ktime_get();
+
+	/*
+	 * Active busyness depends on start_gt_clk (derived from GuC state)
+	 * and gt_stamp (updated only while gt is awake), so GuC state is
+	 * sampled only if gt is awake. Check for reset first so that a
+	 * wakeref is never acquired on a path that does not release it.
+	 */
+	if (!reset_in_progress && intel_gt_pm_get_if_awake(gt)) {
+		guc_update_engine_gt_clks(engine);
+		guc_update_pm_timestamp(guc);
+		intel_gt_pm_put_async(gt);
+	}
+
+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
+	if (stats->running) {
+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
+
+		total += intel_gt_clock_interval_to_ns(gt, clk);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+	if (!reset_in_progress)
+		intel_gt_reset_unlock(gt, srcu);
+
+	return ns_to_ktime(total);
+}
+
+static void __reset_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long flags;
+
+	cancel_delayed_work_sync(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id) {
+		guc_update_engine_gt_clks(engine);
+		engine->stats.guc.prev_total = 0;
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void __update_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id)
+		guc_update_engine_gt_clks(engine);
+}
+
+/* Worker that periodically refreshes gt_stamp and per-engine GuC stats. */
+static void guc_timestamp_ping(struct work_struct *wrk)
+{
+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
+					     timestamp.work.work);
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+	unsigned long flags;
+	int srcu, ret;
+
+	/*
+	 * Synchronize with gt reset to make sure the worker does not
+	 * corrupt the engine/guc stats.
+	 */
+	ret = intel_gt_reset_trylock(gt, &srcu);
+	if (ret)
+		return;
+
+	/* Runtime pm get may sleep, so take the wakeref before the lock. */
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
+		spin_lock_irqsave(&guc->timestamp.lock, flags);
+		__update_guc_busyness_stats(guc);
+		spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+	}
+
+	intel_gt_reset_unlock(gt, srcu);
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
+static int guc_action_enable_usage_stats(struct intel_guc *guc)
+{
+	u32 offset = intel_guc_engine_usage_offset(guc);
+	u32 action[] = {
+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
+		offset,
+		0,
+	};
+
+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
+}
+
+static void guc_init_engine_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
+		int ret = guc_action_enable_usage_stats(guc);
+
+		if (ret)
+			drm_err(&gt->i915->drm,
+				"Failed to enable usage stats: %d!\n", ret);
+	}
+}
+
+void intel_guc_busyness_park(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+
+	cancel_delayed_work(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+	__update_guc_busyness_stats(guc);
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+void intel_guc_busyness_unpark(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -809,6 +1067,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
 	intel_gt_park_heartbeats(guc_to_gt(guc));
 	disable_submission(guc);
 	guc->interrupts.disable(guc);
+	__reset_guc_busyness_stats(guc);
 
 	/* Flush IRQ handler */
 	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
@@ -1132,6 +1391,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
  */
 int intel_guc_submission_init(struct intel_guc *guc)
 {
+	struct intel_gt *gt = guc_to_gt(guc);
 	int ret;
 
 	if (guc->lrc_desc_pool)
@@ -1152,6 +1412,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	INIT_LIST_HEAD(&guc->guc_id_list);
 	ida_init(&guc->guc_ids);
 
+	spin_lock_init(&guc->timestamp.lock);
+	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
+	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
+
 	return 0;
 }
 
@@ -2606,7 +2870,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_flush = gen12_emit_flush_xcs;
 	}
 	engine->set_default_submission = guc_set_default_submission;
+	engine->busyness = guc_engine_busyness;
 
+	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
 	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
 	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
 
@@ -2705,6 +2971,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
 void intel_guc_submission_enable(struct intel_guc *guc)
 {
 	guc_init_lrc_mapping(guc);
+	guc_init_engine_stats(guc);
 }
 
 void intel_guc_submission_disable(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c7ef44fa0c36..5a95a9f0a8e3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
 void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
 				    struct i915_request *hung_rq,
 				    struct drm_printer *m);
+void intel_guc_busyness_park(struct intel_gt *gt);
+void intel_guc_busyness_unpark(struct intel_gt *gt);
 
 bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a897f4abea0c..9aee08425382 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
 #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
 
+#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
+
 /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
 #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
 #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-11 20:08     ` Umesh Nerlige Ramappa
@ 2021-10-12  8:26       ` Tvrtko Ursulin
  0 siblings, 0 replies; 31+ messages in thread
From: Tvrtko Ursulin @ 2021-10-12  8:26 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost


On 11/10/2021 21:08, Umesh Nerlige Ramappa wrote:
> On Mon, Oct 11, 2021 at 12:41:19PM +0100, Tvrtko Ursulin wrote:
>>
>> On 07/10/2021 23:55, Umesh Nerlige Ramappa wrote:
>>> With GuC handling scheduling, i915 is not aware of the time that a
>>> context is scheduled in and out of the engine. Since i915 pmu relies on
>>> this info to provide engine busyness to the user, GuC shares this info
>>> with i915 for all engines using shared memory. For each engine, this
>>> info contains:
>>>
>>> - total busyness: total time that the context was running (total)
>>> - id: id of the running context (id)
>>> - start timestamp: timestamp when the context started running (start)
>>>
>>> At the time (now) of sampling the engine busyness, if the id is valid
>>> (!= ~0), and start is non-zero, then the context is considered to be
>>> active and the engine busyness is calculated using the below equation
>>>
>>>     engine busyness = total + (now - start)
>>>
>>> All times are obtained from the gt clock base. For inactive contexts,
>>> engine busyness is just equal to the total.
>>>
>>> The start and total values provided by GuC are 32 bits and wrap around
>>> in a few minutes. Since perf pmu provides busyness as 64 bit
>>> monotonically increasing values, there is a need for this implementation
>>> to account for overflows and extend the time to 64 bits before returning
>>> busyness to the user. In order to do that, a worker runs periodically
>>> with a period of 1/8th the time it takes for the timestamp to wrap. As
>>> an example, that would be once every 27 seconds for a gt clock
>>> frequency of 19.2 MHz.
>>>
>>> Note:
>>> There might be an overaccounting of busyness due to the fact that GuC
>>> may be updating the total and start values while kmd is reading them
>>> (i.e. kmd may read the updated total and the stale start). In such a
>>> case, the user may see a higher busyness value followed by smaller ones
>>> which would eventually catch up to the higher value.
>>>
>>> v2: (Tvrtko)
>>> - Include details in commit message
>>> - Move intel engine busyness function into execlist code
>>> - Use union inside engine->stats
>>> - Use natural type for ping delay jiffies
>>> - Drop active_work condition checks
>>> - Use for_each_engine if iterating all engines
>>> - Drop seq locking, use spinlock at guc level to update engine stats
>>> - Document worker specific details
>>>
>>> v3: (Tvrtko/Umesh)
>>> - Demarcate guc and execlist stat objects with comments
>>> - Document known over-accounting issue in commit
>>> - Provide a consistent view of guc state
>>> - Add hooks to gt park/unpark for guc busyness
>>> - Stop/start worker in gt park/unpark path
>>> - Drop inline
>>> - Move spinlock and worker inits to guc initialization
>>> - Drop helpers that are called only once
>>>
>>> v4: (Tvrtko/Matt/Umesh)
>>> - Drop addressed opens from commit message
>>> - Get runtime pm in ping, remove from the park path
>>> - Use cancel_delayed_work_sync in disable_submission path
>>> - Update stats during reset prepare
>>> - Skip ping if reset in progress
>>> - Explicitly name execlists and guc stats objects
>>> - Since disable_submission is called from many places, move resetting
>>>   stats to intel_guc_submission_reset_prepare
>>>
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +--
>>>  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
>>>  .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
>>>  drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
>>>  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
>>>  drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  26 ++
>>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
>>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
>>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
>>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 238 ++++++++++++++++++
>>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
>>>  drivers/gpu/drm/i915/i915_reg.h               |   2 +
>>>  12 files changed, 377 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
>>> b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> index 38436f4b5706..6b783fdcba2a 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> @@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs 
>>> *engine,
>>>      intel_engine_print_breadcrumbs(engine, m);
>>>  }
>>> -static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs 
>>> *engine,
>>> -                        ktime_t *now)
>>> -{
>>> -    struct intel_engine_execlists_stats *stats = 
>>> &engine->stats.execlists;
>>> -    ktime_t total = stats->total;
>>> -
>>> -    /*
>>> -     * If the engine is executing something at the moment
>>> -     * add it to the total.
>>> -     */
>>> -    *now = ktime_get();
>>> -    if (READ_ONCE(stats->active))
>>> -        total = ktime_add(total, ktime_sub(*now, stats->start));
>>> -
>>> -    return total;
>>> -}
>>> -
>>>  /**
>>>   * intel_engine_get_busy_time() - Return current accumulated engine 
>>> busyness
>>>   * @engine: engine to report on
>>> @@ -1899,16 +1882,7 @@ static ktime_t 
>>> __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>>>   */
>>>  ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, 
>>> ktime_t *now)
>>>  {
>>> -    struct intel_engine_execlists_stats *stats = 
>>> &engine->stats.execlists;
>>> -    unsigned int seq;
>>> -    ktime_t total;
>>> -
>>> -    do {
>>> -        seq = read_seqcount_begin(&stats->lock);
>>> -        total = __intel_engine_get_busy_time(engine, now);
>>> -    } while (read_seqcount_retry(&stats->lock, seq));
>>> -
>>> -    return total;
>>> +    return engine->busyness(engine, now);
>>>  }
>>>  struct intel_context *
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
>>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> index 316d8551d22f..4eb09d07419a 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> @@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
>>>      ktime_t start;
>>>  };
>>> +struct intel_engine_guc_stats {
>>> +    /**
>>> +     * @running: Active state of the engine when busyness was last 
>>> sampled.
>>> +     */
>>> +    bool running;
>>> +
>>> +    /**
>>> +     * @prev_total: Previous value of total runtime clock cycles.
>>> +     */
>>> +    u32 prev_total;
>>> +
>>> +    /**
>>> +     * @total_gt_clks: Total gt clock cycles this engine was busy.
>>> +     */
>>> +    u64 total_gt_clks;
>>> +
>>> +    /**
>>> +     * @start_gt_clk: GT clock time of last idle to active transition.
>>> +     */
>>> +    u64 start_gt_clk;
>>> +};
>>> +
>>>  struct intel_engine_cs {
>>>      struct drm_i915_private *i915;
>>>      struct intel_gt *gt;
>>> @@ -459,6 +481,12 @@ struct intel_engine_cs {
>>>      void        (*add_active_request)(struct i915_request *rq);
>>>      void        (*remove_active_request)(struct i915_request *rq);
>>> +    /*
>>> +     * Get engine busyness and the time at which the busyness was 
>>> sampled.
>>> +     */
>>> +    ktime_t        (*busyness)(struct intel_engine_cs *engine,
>>> +                    ktime_t *now);
>>> +
>>>      struct intel_engine_execlists execlists;
>>>      /*
>>> @@ -508,7 +536,10 @@ struct intel_engine_cs {
>>>      u32 (*get_cmd_length_mask)(u32 cmd_header);
>>>      struct {
>>> -        struct intel_engine_execlists_stats execlists;
>>> +        union {
>>> +            struct intel_engine_execlists_stats execlists;
>>> +            struct intel_engine_guc_stats guc;
>>> +        };
>>>          /**
>>>           * @rps: Utilisation at last RPS sampling.
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
>>> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> index 7147fe80919e..6bece961eeb1 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> @@ -3292,6 +3292,38 @@ static void execlists_release(struct 
>>> intel_engine_cs *engine)
>>>      lrc_fini_wa_ctx(engine);
>>>  }
>>> +static ktime_t __execlists_engine_busyness(struct intel_engine_cs 
>>> *engine,
>>> +                       ktime_t *now)
>>> +{
>>> +    struct intel_engine_execlists_stats *stats = 
>>> &engine->stats.execlists;
>>> +    ktime_t total = stats->total;
>>> +
>>> +    /*
>>> +     * If the engine is executing something at the moment
>>> +     * add it to the total.
>>> +     */
>>> +    *now = ktime_get();
>>> +    if (READ_ONCE(stats->active))
>>> +        total = ktime_add(total, ktime_sub(*now, stats->start));
>>> +
>>> +    return total;
>>> +}
>>> +
>>> +static ktime_t execlists_engine_busyness(struct intel_engine_cs 
>>> *engine,
>>> +                     ktime_t *now)
>>> +{
>>> +    struct intel_engine_execlists_stats *stats = 
>>> &engine->stats.execlists;
>>> +    unsigned int seq;
>>> +    ktime_t total;
>>> +
>>> +    do {
>>> +        seq = read_seqcount_begin(&stats->lock);
>>> +        total = __execlists_engine_busyness(engine, now);
>>> +    } while (read_seqcount_retry(&stats->lock, seq));
>>> +
>>> +    return total;
>>> +}
>>> +
>>>  static void
>>>  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>>  {
>>> @@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct 
>>> intel_engine_cs *engine)
>>>          engine->emit_bb_start = gen8_emit_bb_start;
>>>      else
>>>          engine->emit_bb_start = gen8_emit_bb_start_noarb;
>>> +
>>> +    engine->busyness = execlists_engine_busyness;
>>>  }
>>>  static void logical_ring_default_irqs(struct intel_engine_cs *engine)
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
>>> b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>> index 524eaf678790..b4a8594bc46c 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>> @@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
>>>      intel_rc6_unpark(&gt->rc6);
>>>      intel_rps_unpark(&gt->rps);
>>>      i915_pmu_gt_unparked(i915);
>>> +    intel_guc_busyness_unpark(gt);
>>>      intel_gt_unpark_requests(gt);
>>>      runtime_begin(gt);
>>> @@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
>>>      runtime_end(gt);
>>>      intel_gt_park_requests(gt);
>>> +    intel_guc_busyness_park(gt);
>>>      i915_vma_parked(gt);
>>>      i915_pmu_gt_parked(i915);
>>>      intel_rps_park(&gt->rps);
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h 
>>> b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>> index 8ff582222aff..ff1311d4beff 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>> @@ -143,6 +143,7 @@ enum intel_guc_action {
>>>      INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>>>      INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>>>      INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
>>> +    INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>>>      INTEL_GUC_ACTION_LIMIT
>>>  };
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> index 5dd174babf7a..22c30dbdf63a 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> @@ -104,6 +104,8 @@ struct intel_guc {
>>>      u32 ads_regset_size;
>>>      /** @ads_golden_ctxt_size: size of the golden contexts in the 
>>> ADS */
>>>      u32 ads_golden_ctxt_size;
>>> +    /** @ads_engine_usage_size: size of engine usage in the ADS */
>>> +    u32 ads_engine_usage_size;
>>>      /** @lrc_desc_pool: object allocated to hold the GuC LRC 
>>> descriptor pool */
>>>      struct i915_vma *lrc_desc_pool;
>>> @@ -138,6 +140,30 @@ struct intel_guc {
>>>      /** @send_mutex: used to serialize the intel_guc_send actions */
>>>      struct mutex send_mutex;
>>> +
>>> +    struct {
>>> +        /**
>>> +         * @lock: Lock protecting the below fields and the engine 
>>> stats.
>>> +         */
>>> +        spinlock_t lock;
>>> +
>>> +        /**
>>> +         * @gt_stamp: 64 bit extended value of the GT timestamp.
>>> +         */
>>> +        u64 gt_stamp;
>>> +
>>> +        /**
>>> +         * @ping_delay: Period for polling the GT timestamp for
>>> +         * overflow.
>>> +         */
>>> +        unsigned long ping_delay;
>>> +
>>> +        /**
>>> +         * @work: Periodic work to adjust GT timestamp, engine and
>>> +         * context usage for overflows.
>>> +         */
>>> +        struct delayed_work work;
>>> +    } timestamp;
>>>  };
>>>  static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>> index 2c6ea64af7ec..ca9ab53999d5 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>> @@ -26,6 +26,8 @@
>>>   *      | guc_policies                          |
>>>   *      +---------------------------------------+
>>>   *      | guc_gt_system_info                    |
>>> + *      +---------------------------------------+
>>> + *      | guc_engine_usage                      |
>>>   *      +---------------------------------------+ <== static
>>>   *      | guc_mmio_reg[countA] (engine 0.0)     |
>>>   *      | guc_mmio_reg[countB] (engine 0.1)     |
>>> @@ -47,6 +49,7 @@ struct __guc_ads_blob {
>>>      struct guc_ads ads;
>>>      struct guc_policies policies;
>>>      struct guc_gt_system_info system_info;
>>> +    struct guc_engine_usage engine_usage;
>>>      /* From here on, location is dynamic! Refer to above diagram. */
>>>      struct guc_mmio_reg regset[0];
>>>  } __packed;
>>> @@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>>>      guc_ads_private_data_reset(guc);
>>>  }
>>> +
>>> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
>>> +{
>>> +    struct __guc_ads_blob *blob = guc->ads_blob;
>>> +    u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
>>> +    u32 offset = base + ptr_offset(blob, engine_usage);
>>> +
>>> +    return offset;
>>> +}
>>> +
>>> +struct guc_engine_usage_record *intel_guc_engine_usage(struct 
>>> intel_engine_cs *engine)
>>> +{
>>> +    struct intel_guc *guc = &engine->gt->uc.guc;
>>> +    struct __guc_ads_blob *blob = guc->ads_blob;
>>> +    u8 guc_class = engine_class_to_guc_class(engine->class);
>>> +
>>> +    return &blob->engine_usage.engines[guc_class][engine->instance];
>>> +}
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>> index 3d85051d57e4..e74c110facff 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>> @@ -6,8 +6,11 @@
>>>  #ifndef _INTEL_GUC_ADS_H_
>>>  #define _INTEL_GUC_ADS_H_
>>> +#include <linux/types.h>
>>> +
>>>  struct intel_guc;
>>>  struct drm_printer;
>>> +struct intel_engine_cs;
>>>  int intel_guc_ads_create(struct intel_guc *guc);
>>>  void intel_guc_ads_destroy(struct intel_guc *guc);
>>> @@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
>>>  void intel_guc_ads_reset(struct intel_guc *guc);
>>>  void intel_guc_ads_print_policy_info(struct intel_guc *guc,
>>>                       struct drm_printer *p);
>>> +struct guc_engine_usage_record *intel_guc_engine_usage(struct 
>>> intel_engine_cs *engine);
>>> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
>>>  #endif
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> index fa4be13c8854..7c9c081670fc 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>> @@ -294,6 +294,19 @@ struct guc_ads {
>>>      u32 reserved[15];
>>>  } __packed;
>>> +/* Engine usage stats */
>>> +struct guc_engine_usage_record {
>>> +    u32 current_context_index;
>>> +    u32 last_switch_in_stamp;
>>> +    u32 reserved0;
>>> +    u32 total_runtime;
>>> +    u32 reserved1[4];
>>> +} __packed;
>>> +
>>> +struct guc_engine_usage {
>>> +    struct guc_engine_usage_record 
>>> engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
>>> +} __packed;
>>> +
>>>  /* GuC logging structures */
>>>  enum guc_log_buffer_type {
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> index ba0de35f6323..f0c27ae2cecc 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> @@ -12,6 +12,7 @@
>>>  #include "gt/intel_engine_pm.h"
>>>  #include "gt/intel_engine_heartbeat.h"
>>>  #include "gt/intel_gt.h"
>>> +#include "gt/intel_gt_clock_utils.h"
>>>  #include "gt/intel_gt_irq.h"
>>>  #include "gt/intel_gt_pm.h"
>>>  #include "gt/intel_gt_requests.h"
>>> @@ -20,6 +21,7 @@
>>>  #include "gt/intel_mocs.h"
>>>  #include "gt/intel_ring.h"
>>> +#include "intel_guc_ads.h"
>>>  #include "intel_guc_submission.h"
>>>  #include "i915_drv.h"
>>> @@ -750,6 +752,233 @@ static void 
>>> scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>>>      xa_unlock_irqrestore(&guc->context_lookup, flags);
>>>  }
>>> +/*
>>> + * GuC stores busyness stats for each engine at context in/out 
>>> boundaries. A
>>> + * context 'in' logs execution start time, 'out' adds in -> out 
>>> delta to total.
>>> + * i915/kmd accesses 'start', 'total' and 'context id' from memory 
>>> shared with
>>> + * GuC.
>>> + *
>>> + * __i915_pmu_event_read samples engine busyness. When sampling, if 
>>> context id
>>> + * is valid (!= ~0) and start is non-zero, the engine is considered 
>>> to be
>>> + * active. For an active engine total busyness = total + (now - 
>>> start), where
>>> + * 'now' is the time at which the busyness is sampled. For inactive 
>>> engine,
>>> + * total busyness = total.
>>> + *
>>> + * All times are captured from GUCPMTIMESTAMP reg and are in gt 
>>> clock domain.
>>> + *
>>> + * The start and total values provided by GuC are 32 bits and wrap 
>>> around in a
>>> + * few minutes. Since perf pmu provides busyness as 64 bit 
>>> monotonically
>>> + * increasing ns values, there is a need for this implementation to 
>>> account for
>>> + * overflows and extend the GuC provided values to 64 bits before 
>>> returning
>>> + * busyness to the user. In order to do that, a worker runs 
>>> periodically at
>>> + * frequency = 1/8th the time it takes for the timestamp to wrap 
>>> (i.e. once in
>>> + * 27 seconds for a gt clock frequency of 19.2 MHz).
>>> + */
>>> +
>>> +#define WRAP_TIME_CLKS U32_MAX
>>> +#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
>>> +
>>> +static void
>>> +__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 
>>> new_start)
>>> +{
>>> +    u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
>>> +    u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
>>> +
>>> +    if (new_start == lower_32_bits(*prev_start))
>>> +        return;
>>> +
>>> +    if (new_start < gt_stamp_last &&
>>> +        (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
>>> +        gt_stamp_hi++;
>>> +
>>> +    if (new_start > gt_stamp_last &&
>>> +        (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
>>> +        gt_stamp_hi--;
>>> +
>>> +    *prev_start = ((u64)gt_stamp_hi << 32) | new_start;
>>> +}
>>> +
>>> +static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
>>> +{
>>> +    struct guc_engine_usage_record *rec = 
>>> intel_guc_engine_usage(engine);
>>> +    struct intel_engine_guc_stats *stats = &engine->stats.guc;
>>> +    struct intel_guc *guc = &engine->gt->uc.guc;
>>> +    u32 last_switch = rec->last_switch_in_stamp;
>>> +    u32 ctx_id = rec->current_context_index;
>>> +    u32 total = rec->total_runtime;
>>> +
>>> +    lockdep_assert_held(&guc->timestamp.lock);
>>> +
>>> +    stats->running = ctx_id != ~0U && last_switch;
>>> +    if (stats->running)
>>> +        __extend_last_switch(guc, &stats->start_gt_clk, last_switch);
>>> +
>>> +    /*
>>> +     * Instead of adjusting the total for overflow, just add the
>>> +     * difference from previous sample stats->total_gt_clks
>>> +     */
>>> +    if (total && total != ~0U) {
>>> +        stats->total_gt_clks += (u32)(total - stats->prev_total);
>>> +        stats->prev_total = total;
>>> +    }
>>> +}
>>> +
>>> +static void guc_update_pm_timestamp(struct intel_guc *guc)
>>> +{
>>> +    struct intel_gt *gt = guc_to_gt(guc);
>>> +    u32 gt_stamp_now, gt_stamp_hi;
>>> +
>>> +    lockdep_assert_held(&guc->timestamp.lock);
>>> +
>>> +    gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
>>> +    gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>> +
>>> +    if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
>>> +        gt_stamp_hi++;
>>> +
>>> +    guc->timestamp.gt_stamp = ((u64) gt_stamp_hi << 32) | gt_stamp_now;
>>> +}
>>> +
>>> +/*
>>> + * Unlike the execlist mode of submission total and active times are 
>>> in terms of
>>> + * gt clocks. The *now parameter is retained to return the cpu time 
>>> at which the
>>> + * busyness was sampled.
>>> + */
>>> +static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, 
>>> ktime_t *now)
>>> +{
>>> +    struct intel_engine_guc_stats *stats = &engine->stats.guc;
>>> +    struct intel_gt *gt = engine->gt;
>>> +    struct intel_guc *guc = &gt->uc.guc;
>>> +    unsigned long flags;
>>> +    u64 total;
>>> +
>>> +    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>> +
>>> +    *now = ktime_get();
>>> +
>>> +    /*
>>> +     * The active busyness depends on start_gt_clk and gt_stamp.
>>> +     * gt_stamp is updated by i915 only when gt is awake and the
>>> +     * start_gt_clk is derived from GuC state. To get a consistent
>>> +     * view of activity, we query the GuC state only if gt is awake.
>>> +     */
>>> +    if (intel_gt_pm_get_if_awake(gt)) {
>>> +        guc_update_engine_gt_clks(engine);
>>
>> Reset can happen at any point theoretically like here, right? Or...
>>
>>> +        guc_update_pm_timestamp(guc);
>>> +        intel_gt_pm_put_async(gt);
>>> +    }
>>> +
>>> +    total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
>>> +    if (stats->running) {
>>> +        u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
>>> +
>>> +        total += intel_gt_clock_interval_to_ns(gt, clk);
>>> +    }
>>> +
>>> +    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>> +
>>> +    return ns_to_ktime(total);
>>> +}
>>> +
>>> +static void __reset_guc_busyness_stats(struct intel_guc *guc)
>>> +{
>>> +    struct intel_gt *gt = guc_to_gt(guc);
>>> +    struct intel_engine_cs *engine;
>>> +    enum intel_engine_id id;
>>> +    unsigned long flags;
>>> +
>>> +    cancel_delayed_work_sync(&guc->timestamp.work);
>>> +
>>> +    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>> +
>>> +    guc_update_pm_timestamp(guc);
>>> +    for_each_engine(engine, gt, id) {
>>> +        guc_update_engine_gt_clks(engine);
>>> +        engine->stats.guc.prev_total = 0;
>>> +    }
>>> +
>>> +    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>> +}
>>> +
>>> +static void __update_guc_busyness_stats(struct intel_guc *guc)
>>> +{
>>> +    struct intel_gt *gt = guc_to_gt(guc);
>>> +    struct intel_engine_cs *engine;
>>> +    enum intel_engine_id id;
>>> +    unsigned long flags;
>>> +
>>> +    spin_lock_irqsave(&guc->timestamp.lock, flags);
>>> +
>>> +    guc_update_pm_timestamp(guc);
>>> +    for_each_engine(engine, gt, id)
>>
>> ... even here when called from guc_timestamp_ping. Both cases would 
>> "corrupt" the saved state due to the potential to read partially 
>> cleared data from the shared page?
>>
>> Looking around the code base it should be possible to use 
>> intel_gt_reset_trylock and intel_gt_reset_unlock from the worker, but 
>> from the PMU callback you can't sleep so you'd just need a new helper, 
>> like a /real/ trylock which just returns error if it fails to lock and 
>> then you treat it the same way as if you failed to get runtime pm ref. 
>> Does that make sense?
> 
> From what I understand...
> 
> You are suggesting I use intel_gt_reset_trylock instead of 
> uc->reset_in_progress below. I thought the flag would be sufficient.

I think you need a lock around the whole access to 
guc_engine_usage_record otherwise I don't see how it is sufficient. PMU 
callback and the worker run asynchronously to GPU activity so reset can 
happen, theoretically, right in the middle of the state being read.

> For PMU callback, why not just use the same uc->reset_in_progress? If 
> reset is in progress, we treat it like failure to get pm wakeref.
> 
> On the other hand, I don't mind adding intel_gt_reset_trylock to ping, 
> but it is not clear how the PMU callback will avoid sleeping, because the 
> reset lock itself (gt->reset.backoff_srcu) is a sleepable RCU. I was 
> thinking of something like this...?
> 
> int intel_gt_reset_sleepless_trylock(struct intel_gt *gt, int *srcu)
> {
>      int reset_in_progress;
> 
>      might_lock(&gt->reset.backoff_srcu);
> 
>      rcu_read_lock();
>      reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
>      *srcu = srcu_read_lock(&gt->reset.backoff_srcu);
>      rcu_read_unlock();
> 
>      return reset_in_progress;
> }
> 
> paired with intel_gt_reset_unlock().

Possibly. I am not really familiar with those code paths. But it appears 
that holding srcu_read_lock is considered enough to prevent resets from 
happening, and it appears srcu_read_lock itself does not sleep, so it 
looks plausible altogether.

Regards,

Tvrtko

> 
> Thanks,
> Umesh
> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>
>>> +        guc_update_engine_gt_clks(engine);
>>> +
>>> +    spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>> +}
>>> +
>>> +static void guc_timestamp_ping(struct work_struct *wrk)
>>> +{
>>> +    struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>> +                         timestamp.work.work);
>>> +    struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>> +    struct intel_gt *gt = guc_to_gt(guc);
>>> +    intel_wakeref_t wakeref;
>>> +
>>> +    if (uc->reset_in_progress)
>>> +        return;
>>> +
>>> +    with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>> +        __update_guc_busyness_stats(guc);
>>> +
>>> +    mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>> +             guc->timestamp.ping_delay);
>>> +}
>>> +
>>> +static int guc_action_enable_usage_stats(struct intel_guc *guc)
>>> +{
>>> +    u32 offset = intel_guc_engine_usage_offset(guc);
>>> +    u32 action[] = {
>>> +        INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
>>> +        offset,
>>> +        0,
>>> +    };
>>> +
>>> +    return intel_guc_send(guc, action, ARRAY_SIZE(action));
>>> +}
>>> +
>>> +static void guc_init_engine_stats(struct intel_guc *guc)
>>> +{
>>> +    struct intel_gt *gt = guc_to_gt(guc);
>>> +    intel_wakeref_t wakeref;
>>> +
>>> +    mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>> +             guc->timestamp.ping_delay);
>>> +
>>> +    with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
>>> +        int ret = guc_action_enable_usage_stats(guc);
>>> +
>>> +        if (ret)
>>> +            drm_err(&gt->i915->drm,
>>> +                "Failed to enable usage stats: %d!\n", ret);
>>> +    }
>>> +}
>>> +
>>> +void intel_guc_busyness_park(struct intel_gt *gt)
>>> +{
>>> +    struct intel_guc *guc = &gt->uc.guc;
>>> +
>>> +    cancel_delayed_work(&guc->timestamp.work);
>>> +    __update_guc_busyness_stats(guc);
>>> +}
>>> +
>>> +void intel_guc_busyness_unpark(struct intel_gt *gt)
>>> +{
>>> +    struct intel_guc *guc = &gt->uc.guc;
>>> +
>>> +    mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>> +             guc->timestamp.ping_delay);
>>> +}
>>> +
>>>  static inline bool
>>>  submission_disabled(struct intel_guc *guc)
>>>  {
>>> @@ -809,6 +1038,7 @@ void intel_guc_submission_reset_prepare(struct 
>>> intel_guc *guc)
>>>      intel_gt_park_heartbeats(guc_to_gt(guc));
>>>      disable_submission(guc);
>>>      guc->interrupts.disable(guc);
>>> +    __reset_guc_busyness_stats(guc);
>>>      /* Flush IRQ handler */
>>>      spin_lock_irq(&guc_to_gt(guc)->irq_lock);
>>> @@ -1132,6 +1362,7 @@ void intel_guc_submission_reset_finish(struct 
>>> intel_guc *guc)
>>>   */
>>>  int intel_guc_submission_init(struct intel_guc *guc)
>>>  {
>>> +    struct intel_gt *gt = guc_to_gt(guc);
>>>      int ret;
>>>      if (guc->lrc_desc_pool)
>>> @@ -1152,6 +1383,10 @@ int intel_guc_submission_init(struct intel_guc 
>>> *guc)
>>>      INIT_LIST_HEAD(&guc->guc_id_list);
>>>      ida_init(&guc->guc_ids);
>>> +    spin_lock_init(&guc->timestamp.lock);
>>> +    INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
>>> +    guc->timestamp.ping_delay = (POLL_TIME_CLKS / 
>>> gt->clock_frequency + 1) * HZ;
>>> +
>>>      return 0;
>>>  }
>>> @@ -2606,7 +2841,9 @@ static void guc_default_vfuncs(struct 
>>> intel_engine_cs *engine)
>>>          engine->emit_flush = gen12_emit_flush_xcs;
>>>      }
>>>      engine->set_default_submission = guc_set_default_submission;
>>> +    engine->busyness = guc_engine_busyness;
>>> +    engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>>>      engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>>>      engine->flags |= I915_ENGINE_HAS_TIMESLICES;
>>> @@ -2705,6 +2942,7 @@ int intel_guc_submission_setup(struct 
>>> intel_engine_cs *engine)
>>>  void intel_guc_submission_enable(struct intel_guc *guc)
>>>  {
>>>      guc_init_lrc_mapping(guc);
>>> +    guc_init_engine_stats(guc);
>>>  }
>>>  void intel_guc_submission_disable(struct intel_guc *guc)
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>> index c7ef44fa0c36..5a95a9f0a8e3 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>> @@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct 
>>> intel_guc *guc,
>>>  void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
>>>                      struct i915_request *hung_rq,
>>>                      struct drm_printer *m);
>>> +void intel_guc_busyness_park(struct intel_gt *gt);
>>> +void intel_guc_busyness_unpark(struct intel_gt *gt);
>>>  bool intel_guc_virtual_engine_has_heartbeat(const struct 
>>> intel_engine_cs *ve);
>>> diff --git a/drivers/gpu/drm/i915/i915_reg.h 
>>> b/drivers/gpu/drm/i915/i915_reg.h
>>> index a897f4abea0c..9aee08425382 100644
>>> --- a/drivers/gpu/drm/i915/i915_reg.h
>>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>>> @@ -2664,6 +2664,8 @@ static inline bool 
>>> i915_mmio_reg_valid(i915_reg_t reg)
>>>  #define   RING_WAIT        (1 << 11) /* gen3+, PRBx_CTL */
>>>  #define   RING_WAIT_SEMAPHORE    (1 << 10) /* gen6+ */
>>> +#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
>>> +
>>>  /* There are 16 64-bit CS General Purpose Registers per-engine on 
>>> Gen8+ */
>>>  #define GEN8_RING_CS_GPR(base, n)    _MMIO((base) + 0x600 + (n) * 8)
>>>  #define GEN8_RING_CS_GPR_UDW(base, n)    _MMIO((base) + 0x600 + (n) 
>>> * 8 + 4)
>>>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-11 11:41   ` Tvrtko Ursulin
@ 2021-10-11 20:08     ` Umesh Nerlige Ramappa
  2021-10-12  8:26       ` Tvrtko Ursulin
  0 siblings, 1 reply; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-11 20:08 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: intel-gfx, dri-devel, john.c.harrison, daniel.vetter, Matthew Brost

On Mon, Oct 11, 2021 at 12:41:19PM +0100, Tvrtko Ursulin wrote:
>
>On 07/10/2021 23:55, Umesh Nerlige Ramappa wrote:
>>With GuC handling scheduling, i915 is not aware of the time that a
>>context is scheduled in and out of the engine. Since i915 pmu relies on
>>this info to provide engine busyness to the user, GuC shares this info
>>with i915 for all engines using shared memory. For each engine, this
>>info contains:
>>
>>- total busyness: total time that the context was running (total)
>>- id: id of the running context (id)
>>- start timestamp: timestamp when the context started running (start)
>>
>>At the time (now) of sampling the engine busyness, if the id is valid
>>(!= ~0), and start is non-zero, then the context is considered to be
>>active and the engine busyness is calculated using the below equation
>>
>>	engine busyness = total + (now - start)
>>
>>All times are obtained from the gt clock base. For inactive contexts,
>>engine busyness is just equal to the total.
>>
>>The start and total values provided by GuC are 32 bits and wrap around
>>in a few minutes. Since perf pmu provides busyness as 64 bit
>>monotonically increasing values, there is a need for this implementation
>>to account for overflows and extend the time to 64 bits before returning
>>busyness to the user. In order to do that, a worker runs periodically with
>>a period of 1/8th the time it takes for the timestamp to wrap. As an
>>example, that would be once every 27 seconds for a gt clock frequency of
>>19.2 MHz.
>>
>>Note:
>>There might be an overaccounting of busyness due to the fact that GuC
>>may be updating the total and start values while kmd is reading them
>>(i.e. kmd may read the updated total and the stale start). In such a
>>case, the user may see a higher busyness value followed by smaller ones
>>which would eventually catch up to the higher value.
>>
>>v2: (Tvrtko)
>>- Include details in commit message
>>- Move intel engine busyness function into execlist code
>>- Use union inside engine->stats
>>- Use natural type for ping delay jiffies
>>- Drop active_work condition checks
>>- Use for_each_engine if iterating all engines
>>- Drop seq locking, use spinlock at guc level to update engine stats
>>- Document worker specific details
>>
>>v3: (Tvrtko/Umesh)
>>- Demarcate guc and execlist stat objects with comments
>>- Document known over-accounting issue in commit
>>- Provide a consistent view of guc state
>>- Add hooks to gt park/unpark for guc busyness
>>- Stop/start worker in gt park/unpark path
>>- Drop inline
>>- Move spinlock and worker inits to guc initialization
>>- Drop helpers that are called only once
>>
>>v4: (Tvrtko/Matt/Umesh)
>>- Drop addressed opens from commit message
>>- Get runtime pm in ping, remove from the park path
>>- Use cancel_delayed_work_sync in disable_submission path
>>- Update stats during reset prepare
>>- Skip ping if reset in progress
>>- Explicitly name execlists and guc stats objects
>>- Since disable_submission is called from many places, move resetting
>>   stats to intel_guc_submission_reset_prepare
>>
>>Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>---
>>  drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +--
>>  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
>>  .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
>>  drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
>>  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
>>  drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  26 ++
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
>>  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 238 ++++++++++++++++++
>>  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
>>  drivers/gpu/drm/i915/i915_reg.h               |   2 +
>>  12 files changed, 377 insertions(+), 28 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>index 38436f4b5706..6b783fdcba2a 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>>  	intel_engine_print_breadcrumbs(engine, m);
>>  }
>>-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>>-					    ktime_t *now)
>>-{
>>-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>-	ktime_t total = stats->total;
>>-
>>-	/*
>>-	 * If the engine is executing something at the moment
>>-	 * add it to the total.
>>-	 */
>>-	*now = ktime_get();
>>-	if (READ_ONCE(stats->active))
>>-		total = ktime_add(total, ktime_sub(*now, stats->start));
>>-
>>-	return total;
>>-}
>>-
>>  /**
>>   * intel_engine_get_busy_time() - Return current accumulated engine busyness
>>   * @engine: engine to report on
>>@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>>   */
>>  ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
>>  {
>>-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>-	unsigned int seq;
>>-	ktime_t total;
>>-
>>-	do {
>>-		seq = read_seqcount_begin(&stats->lock);
>>-		total = __intel_engine_get_busy_time(engine, now);
>>-	} while (read_seqcount_retry(&stats->lock, seq));
>>-
>>-	return total;
>>+	return engine->busyness(engine, now);
>>  }
>>  struct intel_context *
>>diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>index 316d8551d22f..4eb09d07419a 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
>>  	ktime_t start;
>>  };
>>+struct intel_engine_guc_stats {
>>+	/**
>>+	 * @running: Active state of the engine when busyness was last sampled.
>>+	 */
>>+	bool running;
>>+
>>+	/**
>>+	 * @prev_total: Previous value of total runtime clock cycles.
>>+	 */
>>+	u32 prev_total;
>>+
>>+	/**
>>+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
>>+	 */
>>+	u64 total_gt_clks;
>>+
>>+	/**
>>+	 * @start_gt_clk: GT clock time of last idle to active transition.
>>+	 */
>>+	u64 start_gt_clk;
>>+};
>>+
>>  struct intel_engine_cs {
>>  	struct drm_i915_private *i915;
>>  	struct intel_gt *gt;
>>@@ -459,6 +481,12 @@ struct intel_engine_cs {
>>  	void		(*add_active_request)(struct i915_request *rq);
>>  	void		(*remove_active_request)(struct i915_request *rq);
>>+	/*
>>+	 * Get engine busyness and the time at which the busyness was sampled.
>>+	 */
>>+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
>>+				    ktime_t *now);
>>+
>>  	struct intel_engine_execlists execlists;
>>  	/*
>>@@ -508,7 +536,10 @@ struct intel_engine_cs {
>>  	u32 (*get_cmd_length_mask)(u32 cmd_header);
>>  	struct {
>>-		struct intel_engine_execlists_stats execlists;
>>+		union {
>>+			struct intel_engine_execlists_stats execlists;
>>+			struct intel_engine_guc_stats guc;
>>+		};
>>  		/**
>>  		 * @rps: Utilisation at last RPS sampling.
>>diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>index 7147fe80919e..6bece961eeb1 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>@@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
>>  	lrc_fini_wa_ctx(engine);
>>  }
>>+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
>>+					   ktime_t *now)
>>+{
>>+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>+	ktime_t total = stats->total;
>>+
>>+	/*
>>+	 * If the engine is executing something at the moment
>>+	 * add it to the total.
>>+	 */
>>+	*now = ktime_get();
>>+	if (READ_ONCE(stats->active))
>>+		total = ktime_add(total, ktime_sub(*now, stats->start));
>>+
>>+	return total;
>>+}
>>+
>>+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
>>+					 ktime_t *now)
>>+{
>>+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
>>+	unsigned int seq;
>>+	ktime_t total;
>>+
>>+	do {
>>+		seq = read_seqcount_begin(&stats->lock);
>>+		total = __execlists_engine_busyness(engine, now);
>>+	} while (read_seqcount_retry(&stats->lock, seq));
>>+
>>+	return total;
>>+}
>>+
>>  static void
>>  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>  {
>>@@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>  		engine->emit_bb_start = gen8_emit_bb_start;
>>  	else
>>  		engine->emit_bb_start = gen8_emit_bb_start_noarb;
>>+
>>+	engine->busyness = execlists_engine_busyness;
>>  }
>>  static void logical_ring_default_irqs(struct intel_engine_cs *engine)
>>diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>index 524eaf678790..b4a8594bc46c 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
>>@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
>>  	intel_rc6_unpark(&gt->rc6);
>>  	intel_rps_unpark(&gt->rps);
>>  	i915_pmu_gt_unparked(i915);
>>+	intel_guc_busyness_unpark(gt);
>>  	intel_gt_unpark_requests(gt);
>>  	runtime_begin(gt);
>>@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
>>  	runtime_end(gt);
>>  	intel_gt_park_requests(gt);
>>+	intel_guc_busyness_park(gt);
>>  	i915_vma_parked(gt);
>>  	i915_pmu_gt_parked(i915);
>>  	intel_rps_park(&gt->rps);
>>diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>index 8ff582222aff..ff1311d4beff 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>>@@ -143,6 +143,7 @@ enum intel_guc_action {
>>  	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>>  	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>>  	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
>>+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>>  	INTEL_GUC_ACTION_LIMIT
>>  };
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>index 5dd174babf7a..22c30dbdf63a 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>@@ -104,6 +104,8 @@ struct intel_guc {
>>  	u32 ads_regset_size;
>>  	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
>>  	u32 ads_golden_ctxt_size;
>>+	/** @ads_engine_usage_size: size of engine usage in the ADS */
>>+	u32 ads_engine_usage_size;
>>  	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
>>  	struct i915_vma *lrc_desc_pool;
>>@@ -138,6 +140,30 @@ struct intel_guc {
>>  	/** @send_mutex: used to serialize the intel_guc_send actions */
>>  	struct mutex send_mutex;
>>+
>>+	struct {
>>+		/**
>>+		 * @lock: Lock protecting the below fields and the engine stats.
>>+		 */
>>+		spinlock_t lock;
>>+
>>+		/**
>>+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
>>+		 */
>>+		u64 gt_stamp;
>>+
>>+		/**
>>+		 * @ping_delay: Period for polling the GT timestamp for
>>+		 * overflow.
>>+		 */
>>+		unsigned long ping_delay;
>>+
>>+		/**
>>+		 * @work: Periodic work to adjust GT timestamp, engine and
>>+		 * context usage for overflows.
>>+		 */
>>+		struct delayed_work work;
>>+	} timestamp;
>>  };
>>  static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>index 2c6ea64af7ec..ca9ab53999d5 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>>@@ -26,6 +26,8 @@
>>   *      | guc_policies                          |
>>   *      +---------------------------------------+
>>   *      | guc_gt_system_info                    |
>>+ *      +---------------------------------------+
>>+ *      | guc_engine_usage                      |
>>   *      +---------------------------------------+ <== static
>>   *      | guc_mmio_reg[countA] (engine 0.0)     |
>>   *      | guc_mmio_reg[countB] (engine 0.1)     |
>>@@ -47,6 +49,7 @@ struct __guc_ads_blob {
>>  	struct guc_ads ads;
>>  	struct guc_policies policies;
>>  	struct guc_gt_system_info system_info;
>>+	struct guc_engine_usage engine_usage;
>>  	/* From here on, location is dynamic! Refer to above diagram. */
>>  	struct guc_mmio_reg regset[0];
>>  } __packed;
>>@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>>  	guc_ads_private_data_reset(guc);
>>  }
>>+
>>+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
>>+{
>>+	struct __guc_ads_blob *blob = guc->ads_blob;
>>+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
>>+	u32 offset = base + ptr_offset(blob, engine_usage);
>>+
>>+	return offset;
>>+}
>>+
>>+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
>>+{
>>+	struct intel_guc *guc = &engine->gt->uc.guc;
>>+	struct __guc_ads_blob *blob = guc->ads_blob;
>>+	u8 guc_class = engine_class_to_guc_class(engine->class);
>>+
>>+	return &blob->engine_usage.engines[guc_class][engine->instance];
>>+}
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>index 3d85051d57e4..e74c110facff 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
>>@@ -6,8 +6,11 @@
>>  #ifndef _INTEL_GUC_ADS_H_
>>  #define _INTEL_GUC_ADS_H_
>>+#include <linux/types.h>
>>+
>>  struct intel_guc;
>>  struct drm_printer;
>>+struct intel_engine_cs;
>>  int intel_guc_ads_create(struct intel_guc *guc);
>>  void intel_guc_ads_destroy(struct intel_guc *guc);
>>@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
>>  void intel_guc_ads_reset(struct intel_guc *guc);
>>  void intel_guc_ads_print_policy_info(struct intel_guc *guc,
>>  				     struct drm_printer *p);
>>+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
>>+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
>>  #endif
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>index fa4be13c8854..7c9c081670fc 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>@@ -294,6 +294,19 @@ struct guc_ads {
>>  	u32 reserved[15];
>>  } __packed;
>>+/* Engine usage stats */
>>+struct guc_engine_usage_record {
>>+	u32 current_context_index;
>>+	u32 last_switch_in_stamp;
>>+	u32 reserved0;
>>+	u32 total_runtime;
>>+	u32 reserved1[4];
>>+} __packed;
>>+
>>+struct guc_engine_usage {
>>+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
>>+} __packed;
>>+
>>  /* GuC logging structures */
>>  enum guc_log_buffer_type {
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>index ba0de35f6323..f0c27ae2cecc 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>@@ -12,6 +12,7 @@
>>  #include "gt/intel_engine_pm.h"
>>  #include "gt/intel_engine_heartbeat.h"
>>  #include "gt/intel_gt.h"
>>+#include "gt/intel_gt_clock_utils.h"
>>  #include "gt/intel_gt_irq.h"
>>  #include "gt/intel_gt_pm.h"
>>  #include "gt/intel_gt_requests.h"
>>@@ -20,6 +21,7 @@
>>  #include "gt/intel_mocs.h"
>>  #include "gt/intel_ring.h"
>>+#include "intel_guc_ads.h"
>>  #include "intel_guc_submission.h"
>>  #include "i915_drv.h"
>>@@ -750,6 +752,233 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>>  	xa_unlock_irqrestore(&guc->context_lookup, flags);
>>  }
>>+/*
>>+ * GuC stores busyness stats for each engine at context in/out boundaries. A
>>+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
>>+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
>>+ * GuC.
>>+ *
>>+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
>>+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
>>+ * active. For an active engine total busyness = total + (now - start), where
>>+ * 'now' is the time at which the busyness is sampled. For inactive engine,
>>+ * total busyness = total.
>>+ *
>>+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
>>+ *
>>+ * The start and total values provided by GuC are 32 bits and wrap around in a
>>+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
>>+ * increasing ns values, there is a need for this implementation to account for
>>+ * overflows and extend the GuC provided values to 64 bits before returning
>>+ * busyness to the user. In order to do that, a worker runs periodically at
>>+ * frequency = 1/8th the time it takes for the timestamp to wrap (i.e. once in
>>+ * 27 seconds for a gt clock frequency of 19.2 MHz).
>>+ */
>>+
>>+#define WRAP_TIME_CLKS U32_MAX
>>+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
>>+
>>+static void
>>+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
>>+{
>>+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
>>+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
>>+
>>+	if (new_start == lower_32_bits(*prev_start))
>>+		return;
>>+
>>+	if (new_start < gt_stamp_last &&
>>+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
>>+		gt_stamp_hi++;
>>+
>>+	if (new_start > gt_stamp_last &&
>>+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
>>+		gt_stamp_hi--;
>>+
>>+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
>>+}
>>+
>>+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
>>+{
>>+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
>>+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
>>+	struct intel_guc *guc = &engine->gt->uc.guc;
>>+	u32 last_switch = rec->last_switch_in_stamp;
>>+	u32 ctx_id = rec->current_context_index;
>>+	u32 total = rec->total_runtime;
>>+
>>+	lockdep_assert_held(&guc->timestamp.lock);
>>+
>>+	stats->running = ctx_id != ~0U && last_switch;
>>+	if (stats->running)
>>+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
>>+
>>+	/*
>>+	 * Instead of adjusting the total for overflow, just add the
>>+	 * difference from previous sample stats->total_gt_clks
>>+	 */
>>+	if (total && total != ~0U) {
>>+		stats->total_gt_clks += (u32)(total - stats->prev_total);
>>+		stats->prev_total = total;
>>+	}
>>+}
>>+
>>+static void guc_update_pm_timestamp(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	u32 gt_stamp_now, gt_stamp_hi;
>>+
>>+	lockdep_assert_held(&guc->timestamp.lock);
>>+
>>+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
>>+	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
>>+
>>+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
>>+		gt_stamp_hi++;
>>+
>>+	guc->timestamp.gt_stamp = ((u64) gt_stamp_hi << 32) | gt_stamp_now;
>>+}
>>+
>>+/*
>>+ * Unlike the execlist mode of submission total and active times are in terms of
>>+ * gt clocks. The *now parameter is retained to return the cpu time at which the
>>+ * busyness was sampled.
>>+ */
>>+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
>>+{
>>+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
>>+	struct intel_gt *gt = engine->gt;
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+	unsigned long flags;
>>+	u64 total;
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	*now = ktime_get();
>>+
>>+	/*
>>+	 * The active busyness depends on start_gt_clk and gt_stamp.
>>+	 * gt_stamp is updated by i915 only when gt is awake and the
>>+	 * start_gt_clk is derived from GuC state. To get a consistent
>>+	 * view of activity, we query the GuC state only if gt is awake.
>>+	 */
>>+	if (intel_gt_pm_get_if_awake(gt)) {
>>+		guc_update_engine_gt_clks(engine);
>
>Reset can happen at any point theoretically like here, right? Or...
>
>>+		guc_update_pm_timestamp(guc);
>>+		intel_gt_pm_put_async(gt);
>>+	}
>>+
>>+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
>>+	if (stats->running) {
>>+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
>>+
>>+		total += intel_gt_clock_interval_to_ns(gt, clk);
>>+	}
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+
>>+	return ns_to_ktime(total);
>>+}
>>+
>>+static void __reset_guc_busyness_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	struct intel_engine_cs *engine;
>>+	enum intel_engine_id id;
>>+	unsigned long flags;
>>+
>>+	cancel_delayed_work_sync(&guc->timestamp.work);
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	guc_update_pm_timestamp(guc);
>>+	for_each_engine(engine, gt, id) {
>>+		guc_update_engine_gt_clks(engine);
>>+		engine->stats.guc.prev_total = 0;
>>+	}
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+}
>>+
>>+static void __update_guc_busyness_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	struct intel_engine_cs *engine;
>>+	enum intel_engine_id id;
>>+	unsigned long flags;
>>+
>>+	spin_lock_irqsave(&guc->timestamp.lock, flags);
>>+
>>+	guc_update_pm_timestamp(guc);
>>+	for_each_engine(engine, gt, id)
>
>... even here when called from guc_timestamp_ping. Both cases would
>"corrupt" the saved state due to the potential to read partially cleared
>data from the shared page?
>
>Looking around the code base it should be possible to use 
>intel_gt_reset_trylock and intel_gt_reset_unlock from the worker, but 
>from the PMU callback you can't sleep so you'd just need a new helper, 
>like a /real/ trylock which just returns error if it fails to lock and 
>then you treat it the same way as if you failed to get runtime pm ref. 
>Does that make sense?

From what I understand:

You are suggesting I use intel_gt_reset_trylock instead of uc->reset_in_progress
below. I thought the flag would be sufficient.

For the PMU callback, why not just use the same uc->reset_in_progress? If a reset
is in progress, we treat it like a failure to get the pm wakeref.

On the other hand, I don't mind adding intel_gt_reset_trylock to the ping worker,
but it is not clear to me how the PMU callback will avoid sleeping, because the
reset lock itself (gt->reset.backoff_srcu) is a sleepable RCU. Something like
this, perhaps?

/*
 * Sketch of a non-waiting reset "trylock" for callers such as the PMU
 * callback: it samples I915_RESET_BACKOFF and enters the
 * gt->reset.backoff_srcu read-side section without waiting on anything.
 *
 * @gt:   GT whose reset flags and backoff_srcu are used
 * @srcu: out-parameter; receives the value returned by srcu_read_lock()
 *
 * Returns non-zero if I915_RESET_BACKOFF was set when sampled, 0 otherwise.
 *
 * NOTE(review): srcu_read_lock() is taken on BOTH return paths, so as written
 * the caller has to drop it even when a reset was reported. Confirm this is
 * intended, rather than returning early before the SRCU section is entered.
 */
int intel_gt_reset_sleepless_trylock(struct intel_gt *gt, int *srcu)
{
	int reset_in_progress;

	/* presumably a lockdep-only annotation for backoff_srcu — TODO confirm */
	might_lock(&gt->reset.backoff_srcu);

	/*
	 * The flag is sampled and the SRCU read lock is taken inside the same
	 * RCU read-side section; keep these statements in this order.
	 */
	rcu_read_lock();
	reset_in_progress = test_bit(I915_RESET_BACKOFF, &gt->reset.flags);
	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
	rcu_read_unlock();

	/* Result of the flag test only; says nothing about the SRCU lock state. */
	return reset_in_progress;
}

paired with intel_gt_reset_unlock().

Thanks,
Umesh

>
>Regards,
>
>Tvrtko
>
>
>>+		guc_update_engine_gt_clks(engine);
>>+
>>+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>+}
>>+
>>+static void guc_timestamp_ping(struct work_struct *wrk)
>>+{
>>+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>+					     timestamp.work.work);
>>+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	intel_wakeref_t wakeref;
>>+
>>+	if (uc->reset_in_progress)
>>+		return;
>>+
>>+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
>>+		__update_guc_busyness_stats(guc);
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>>+}
>>+
>>+static int guc_action_enable_usage_stats(struct intel_guc *guc)
>>+{
>>+	u32 offset = intel_guc_engine_usage_offset(guc);
>>+	u32 action[] = {
>>+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
>>+		offset,
>>+		0,
>>+	};
>>+
>>+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
>>+}
>>+
>>+static void guc_init_engine_stats(struct intel_guc *guc)
>>+{
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>+	intel_wakeref_t wakeref;
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>>+
>>+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
>>+		int ret = guc_action_enable_usage_stats(guc);
>>+
>>+		if (ret)
>>+			drm_err(&gt->i915->drm,
>>+				"Failed to enable usage stats: %d!\n", ret);
>>+	}
>>+}
>>+
>>+void intel_guc_busyness_park(struct intel_gt *gt)
>>+{
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+
>>+	cancel_delayed_work(&guc->timestamp.work);
>>+	__update_guc_busyness_stats(guc);
>>+}
>>+
>>+void intel_guc_busyness_unpark(struct intel_gt *gt)
>>+{
>>+	struct intel_guc *guc = &gt->uc.guc;
>>+
>>+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>+			 guc->timestamp.ping_delay);
>>+}
>>+
>>  static inline bool
>>  submission_disabled(struct intel_guc *guc)
>>  {
>>@@ -809,6 +1038,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
>>  	intel_gt_park_heartbeats(guc_to_gt(guc));
>>  	disable_submission(guc);
>>  	guc->interrupts.disable(guc);
>>+	__reset_guc_busyness_stats(guc);
>>  	/* Flush IRQ handler */
>>  	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
>>@@ -1132,6 +1362,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
>>   */
>>  int intel_guc_submission_init(struct intel_guc *guc)
>>  {
>>+	struct intel_gt *gt = guc_to_gt(guc);
>>  	int ret;
>>  	if (guc->lrc_desc_pool)
>>@@ -1152,6 +1383,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
>>  	INIT_LIST_HEAD(&guc->guc_id_list);
>>  	ida_init(&guc->guc_ids);
>>+	spin_lock_init(&guc->timestamp.lock);
>>+	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
>>+	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
>>+
>>  	return 0;
>>  }
>>@@ -2606,7 +2841,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>>  		engine->emit_flush = gen12_emit_flush_xcs;
>>  	}
>>  	engine->set_default_submission = guc_set_default_submission;
>>+	engine->busyness = guc_engine_busyness;
>>+	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>>  	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>>  	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
>>@@ -2705,6 +2942,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
>>  void intel_guc_submission_enable(struct intel_guc *guc)
>>  {
>>  	guc_init_lrc_mapping(guc);
>>+	guc_init_engine_stats(guc);
>>  }
>>  void intel_guc_submission_disable(struct intel_guc *guc)
>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>index c7ef44fa0c36..5a95a9f0a8e3 100644
>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>>@@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>>  void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
>>  				    struct i915_request *hung_rq,
>>  				    struct drm_printer *m);
>>+void intel_guc_busyness_park(struct intel_gt *gt);
>>+void intel_guc_busyness_unpark(struct intel_gt *gt);
>>  bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>>diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>>index a897f4abea0c..9aee08425382 100644
>>--- a/drivers/gpu/drm/i915/i915_reg.h
>>+++ b/drivers/gpu/drm/i915/i915_reg.h
>>@@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>  #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
>>  #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
>>+#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
>>+
>>  /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
>>  #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
>>  #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
>>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-07 22:55 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
@ 2021-10-11 11:41   ` Tvrtko Ursulin
  2021-10-11 20:08     ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 31+ messages in thread
From: Tvrtko Ursulin @ 2021-10-11 11:41 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, dri-devel
  Cc: john.c.harrison, daniel.vetter, Matthew Brost


On 07/10/2021 23:55, Umesh Nerlige Ramappa wrote:
> With GuC handling scheduling, i915 is not aware of the time that a
> context is scheduled in and out of the engine. Since i915 pmu relies on
> this info to provide engine busyness to the user, GuC shares this info
> with i915 for all engines using shared memory. For each engine, this
> info contains:
> 
> - total busyness: total time that the context was running (total)
> - id: id of the running context (id)
> - start timestamp: timestamp when the context started running (start)
> 
> At the time (now) of sampling the engine busyness, if the id is valid
> (!= ~0), and start is non-zero, then the context is considered to be
> active and the engine busyness is calculated using the below equation
> 
> 	engine busyness = total + (now - start)
> 
> All times are obtained from the gt clock base. For inactive contexts,
> engine busyness is just equal to the total.
> 
> The start and total values provided by GuC are 32 bits and wrap around
> in a few minutes. Since perf pmu provides busyness as 64 bit
> monotonically increasing values, there is a need for this implementation
> to account for overflows and extend the time to 64 bits before returning
> busyness to the user. In order to do that, a worker runs periodically with
> a period of 1/8th the time it takes for the timestamp to wrap. As an
> example, that would be once every 27 seconds for a gt clock frequency of
> 19.2 MHz.
> 
> Note:
> There might be an overaccounting of busyness due to the fact that GuC
> may be updating the total and start values while kmd is reading them
> (i.e. kmd may read the updated total and the stale start). In such a
> case, the user may see a higher busyness value followed by smaller ones
> which would eventually catch up to the higher value.
> 
> v2: (Tvrtko)
> - Include details in commit message
> - Move intel engine busyness function into execlist code
> - Use union inside engine->stats
> - Use natural type for ping delay jiffies
> - Drop active_work condition checks
> - Use for_each_engine if iterating all engines
> - Drop seq locking, use spinlock at guc level to update engine stats
> - Document worker specific details
> 
> v3: (Tvrtko/Umesh)
> - Demarcate guc and execlist stat objects with comments
> - Document known over-accounting issue in commit
> - Provide a consistent view of guc state
> - Add hooks to gt park/unpark for guc busyness
> - Stop/start worker in gt park/unpark path
> - Drop inline
> - Move spinlock and worker inits to guc initialization
> - Drop helpers that are called only once
> 
> v4: (Tvrtko/Matt/Umesh)
> - Drop addressed opens from commit message
> - Get runtime pm in ping, remove from the park path
> - Use cancel_delayed_work_sync in disable_submission path
> - Update stats during reset prepare
> - Skip ping if reset in progress
> - Explicitly name execlists and guc stats objects
> - Since disable_submission is called from many places, move resetting
>    stats to intel_guc_submission_reset_prepare
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +--
>   drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
>   .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
>   drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
>   .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  26 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 238 ++++++++++++++++++
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
>   drivers/gpu/drm/i915/i915_reg.h               |   2 +
>   12 files changed, 377 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 38436f4b5706..6b783fdcba2a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   	intel_engine_print_breadcrumbs(engine, m);
>   }
>   
> -static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
> -					    ktime_t *now)
> -{
> -	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> -	ktime_t total = stats->total;
> -
> -	/*
> -	 * If the engine is executing something at the moment
> -	 * add it to the total.
> -	 */
> -	*now = ktime_get();
> -	if (READ_ONCE(stats->active))
> -		total = ktime_add(total, ktime_sub(*now, stats->start));
> -
> -	return total;
> -}
> -
>   /**
>    * intel_engine_get_busy_time() - Return current accumulated engine busyness
>    * @engine: engine to report on
> @@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
>    */
>   ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
>   {
> -	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> -	unsigned int seq;
> -	ktime_t total;
> -
> -	do {
> -		seq = read_seqcount_begin(&stats->lock);
> -		total = __intel_engine_get_busy_time(engine, now);
> -	} while (read_seqcount_retry(&stats->lock, seq));
> -
> -	return total;
> +	return engine->busyness(engine, now);
>   }
>   
>   struct intel_context *
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 316d8551d22f..4eb09d07419a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
>   	ktime_t start;
>   };
>   
> +struct intel_engine_guc_stats {
> +	/**
> +	 * @running: Active state of the engine when busyness was last sampled.
> +	 */
> +	bool running;
> +
> +	/**
> +	 * @prev_total: Previous value of total runtime clock cycles.
> +	 */
> +	u32 prev_total;
> +
> +	/**
> +	 * @total_gt_clks: Total gt clock cycles this engine was busy.
> +	 */
> +	u64 total_gt_clks;
> +
> +	/**
> +	 * @start_gt_clk: GT clock time of last idle to active transition.
> +	 */
> +	u64 start_gt_clk;
> +};
> +
>   struct intel_engine_cs {
>   	struct drm_i915_private *i915;
>   	struct intel_gt *gt;
> @@ -459,6 +481,12 @@ struct intel_engine_cs {
>   	void		(*add_active_request)(struct i915_request *rq);
>   	void		(*remove_active_request)(struct i915_request *rq);
>   
> +	/*
> +	 * Get engine busyness and the time at which the busyness was sampled.
> +	 */
> +	ktime_t		(*busyness)(struct intel_engine_cs *engine,
> +				    ktime_t *now);
> +
>   	struct intel_engine_execlists execlists;
>   
>   	/*
> @@ -508,7 +536,10 @@ struct intel_engine_cs {
>   	u32 (*get_cmd_length_mask)(u32 cmd_header);
>   
>   	struct {
> -		struct intel_engine_execlists_stats execlists;
> +		union {
> +			struct intel_engine_execlists_stats execlists;
> +			struct intel_engine_guc_stats guc;
> +		};
>   
>   		/**
>   		 * @rps: Utilisation at last RPS sampling.
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 7147fe80919e..6bece961eeb1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
>   	lrc_fini_wa_ctx(engine);
>   }
>   
> +static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
> +					   ktime_t *now)
> +{
> +	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> +	ktime_t total = stats->total;
> +
> +	/*
> +	 * If the engine is executing something at the moment
> +	 * add it to the total.
> +	 */
> +	*now = ktime_get();
> +	if (READ_ONCE(stats->active))
> +		total = ktime_add(total, ktime_sub(*now, stats->start));
> +
> +	return total;
> +}
> +
> +static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
> +					 ktime_t *now)
> +{
> +	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
> +	unsigned int seq;
> +	ktime_t total;
> +
> +	do {
> +		seq = read_seqcount_begin(&stats->lock);
> +		total = __execlists_engine_busyness(engine, now);
> +	} while (read_seqcount_retry(&stats->lock, seq));
> +
> +	return total;
> +}
> +
>   static void
>   logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   {
> @@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   		engine->emit_bb_start = gen8_emit_bb_start;
>   	else
>   		engine->emit_bb_start = gen8_emit_bb_start_noarb;
> +
> +	engine->busyness = execlists_engine_busyness;
>   }
>   
>   static void logical_ring_default_irqs(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 524eaf678790..b4a8594bc46c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
>   	intel_rc6_unpark(&gt->rc6);
>   	intel_rps_unpark(&gt->rps);
>   	i915_pmu_gt_unparked(i915);
> +	intel_guc_busyness_unpark(gt);
>   
>   	intel_gt_unpark_requests(gt);
>   	runtime_begin(gt);
> @@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
>   	runtime_end(gt);
>   	intel_gt_park_requests(gt);
>   
> +	intel_guc_busyness_park(gt);
>   	i915_vma_parked(gt);
>   	i915_pmu_gt_parked(i915);
>   	intel_rps_park(&gt->rps);
> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> index 8ff582222aff..ff1311d4beff 100644
> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> @@ -143,6 +143,7 @@ enum intel_guc_action {
>   	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>   	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>   	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
> +	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>   	INTEL_GUC_ACTION_LIMIT
>   };
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index 5dd174babf7a..22c30dbdf63a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -104,6 +104,8 @@ struct intel_guc {
>   	u32 ads_regset_size;
>   	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
>   	u32 ads_golden_ctxt_size;
> +	/** @ads_engine_usage_size: size of engine usage in the ADS */
> +	u32 ads_engine_usage_size;
>   
>   	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
>   	struct i915_vma *lrc_desc_pool;
> @@ -138,6 +140,30 @@ struct intel_guc {
>   
>   	/** @send_mutex: used to serialize the intel_guc_send actions */
>   	struct mutex send_mutex;
> +
> +	struct {
> +		/**
> +		 * @lock: Lock protecting the below fields and the engine stats.
> +		 */
> +		spinlock_t lock;
> +
> +		/**
> +		 * @gt_stamp: 64 bit extended value of the GT timestamp.
> +		 */
> +		u64 gt_stamp;
> +
> +		/**
> +		 * @ping_delay: Period for polling the GT timestamp for
> +		 * overflow.
> +		 */
> +		unsigned long ping_delay;
> +
> +		/**
> +		 * @work: Periodic work to adjust GT timestamp, engine and
> +		 * context usage for overflows.
> +		 */
> +		struct delayed_work work;
> +	} timestamp;
>   };
>   
>   static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> index 2c6ea64af7ec..ca9ab53999d5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
> @@ -26,6 +26,8 @@
>    *      | guc_policies                          |
>    *      +---------------------------------------+
>    *      | guc_gt_system_info                    |
> + *      +---------------------------------------+
> + *      | guc_engine_usage                      |
>    *      +---------------------------------------+ <== static
>    *      | guc_mmio_reg[countA] (engine 0.0)     |
>    *      | guc_mmio_reg[countB] (engine 0.1)     |
> @@ -47,6 +49,7 @@ struct __guc_ads_blob {
>   	struct guc_ads ads;
>   	struct guc_policies policies;
>   	struct guc_gt_system_info system_info;
> +	struct guc_engine_usage engine_usage;
>   	/* From here on, location is dynamic! Refer to above diagram. */
>   	struct guc_mmio_reg regset[0];
>   } __packed;
> @@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>   
>   	guc_ads_private_data_reset(guc);
>   }
> +
> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
> +{
> +	struct __guc_ads_blob *blob = guc->ads_blob;
> +	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
> +	u32 offset = base + ptr_offset(blob, engine_usage);
> +
> +	return offset;
> +}
> +
> +struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
> +{
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	struct __guc_ads_blob *blob = guc->ads_blob;
> +	u8 guc_class = engine_class_to_guc_class(engine->class);
> +
> +	return &blob->engine_usage.engines[guc_class][engine->instance];
> +}
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> index 3d85051d57e4..e74c110facff 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
> @@ -6,8 +6,11 @@
>   #ifndef _INTEL_GUC_ADS_H_
>   #define _INTEL_GUC_ADS_H_
>   
> +#include <linux/types.h>
> +
>   struct intel_guc;
>   struct drm_printer;
> +struct intel_engine_cs;
>   
>   int intel_guc_ads_create(struct intel_guc *guc);
>   void intel_guc_ads_destroy(struct intel_guc *guc);
> @@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
>   void intel_guc_ads_reset(struct intel_guc *guc);
>   void intel_guc_ads_print_policy_info(struct intel_guc *guc,
>   				     struct drm_printer *p);
> +struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
> +u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
>   
>   #endif
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index fa4be13c8854..7c9c081670fc 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -294,6 +294,19 @@ struct guc_ads {
>   	u32 reserved[15];
>   } __packed;
>   
> +/* Engine usage stats */
> +struct guc_engine_usage_record {
> +	u32 current_context_index;
> +	u32 last_switch_in_stamp;
> +	u32 reserved0;
> +	u32 total_runtime;
> +	u32 reserved1[4];
> +} __packed;
> +
> +struct guc_engine_usage {
> +	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
> +} __packed;
> +
>   /* GuC logging structures */
>   
>   enum guc_log_buffer_type {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index ba0de35f6323..f0c27ae2cecc 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -12,6 +12,7 @@
>   #include "gt/intel_engine_pm.h"
>   #include "gt/intel_engine_heartbeat.h"
>   #include "gt/intel_gt.h"
> +#include "gt/intel_gt_clock_utils.h"
>   #include "gt/intel_gt_irq.h"
>   #include "gt/intel_gt_pm.h"
>   #include "gt/intel_gt_requests.h"
> @@ -20,6 +21,7 @@
>   #include "gt/intel_mocs.h"
>   #include "gt/intel_ring.h"
>   
> +#include "intel_guc_ads.h"
>   #include "intel_guc_submission.h"
>   
>   #include "i915_drv.h"
> @@ -750,6 +752,233 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>   	xa_unlock_irqrestore(&guc->context_lookup, flags);
>   }
>   
> +/*
> + * GuC stores busyness stats for each engine at context in/out boundaries. A
> + * context 'in' logs execution start time, 'out' adds in -> out delta to total.
> + * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
> + * GuC.
> + *
> + * __i915_pmu_event_read samples engine busyness. When sampling, if context id
> + * is valid (!= ~0) and start is non-zero, the engine is considered to be
> + * active. For an active engine total busyness = total + (now - start), where
> + * 'now' is the time at which the busyness is sampled. For inactive engine,
> + * total busyness = total.
> + *
> + * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
> + *
> + * The start and total values provided by GuC are 32 bits and wrap around in a
> + * few minutes. Since perf pmu provides busyness as 64 bit monotonically
> + * increasing ns values, there is a need for this implementation to account for
> + * overflows and extend the GuC provided values to 64 bits before returning
> + * busyness to the user. In order to do that, a worker runs periodically at
> + * frequency = 1/8th the time it takes for the timestamp to wrap (i.e. once in
> + * 27 seconds for a gt clock frequency of 19.2 MHz).
> + */
> +
> +#define WRAP_TIME_CLKS U32_MAX
> +#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
> +
> +static void
> +__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
> +{
> +	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
> +	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
> +
> +	if (new_start == lower_32_bits(*prev_start))
> +		return;
> +
> +	if (new_start < gt_stamp_last &&
> +	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
> +		gt_stamp_hi++;
> +
> +	if (new_start > gt_stamp_last &&
> +	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
> +		gt_stamp_hi--;
> +
> +	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
> +}
> +
> +static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
> +{
> +	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
> +	struct intel_engine_guc_stats *stats = &engine->stats.guc;
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	u32 last_switch = rec->last_switch_in_stamp;
> +	u32 ctx_id = rec->current_context_index;
> +	u32 total = rec->total_runtime;
> +
> +	lockdep_assert_held(&guc->timestamp.lock);
> +
> +	stats->running = ctx_id != ~0U && last_switch;
> +	if (stats->running)
> +		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
> +
> +	/*
> +	 * Instead of adjusting the total for overflow, just add the
> +	 * difference from previous sample stats->total_gt_clks
> +	 */
> +	if (total && total != ~0U) {
> +		stats->total_gt_clks += (u32)(total - stats->prev_total);
> +		stats->prev_total = total;
> +	}
> +}
> +
> +static void guc_update_pm_timestamp(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	u32 gt_stamp_now, gt_stamp_hi;
> +
> +	lockdep_assert_held(&guc->timestamp.lock);
> +
> +	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
> +	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
> +
> +	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
> +		gt_stamp_hi++;
> +
> +	guc->timestamp.gt_stamp = ((u64) gt_stamp_hi << 32) | gt_stamp_now;
> +}
> +
> +/*
> + * Unlike the execlist mode of submission total and active times are in terms of
> + * gt clocks. The *now parameter is retained to return the cpu time at which the
> + * busyness was sampled.
> + */
> +static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
> +{
> +	struct intel_engine_guc_stats *stats = &engine->stats.guc;
> +	struct intel_gt *gt = engine->gt;
> +	struct intel_guc *guc = &gt->uc.guc;
> +	unsigned long flags;
> +	u64 total;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	*now = ktime_get();
> +
> +	/*
> +	 * The active busyness depends on start_gt_clk and gt_stamp.
> +	 * gt_stamp is updated by i915 only when gt is awake and the
> +	 * start_gt_clk is derived from GuC state. To get a consistent
> +	 * view of activity, we query the GuC state only if gt is awake.
> +	 */
> +	if (intel_gt_pm_get_if_awake(gt)) {
> +		guc_update_engine_gt_clks(engine);

Reset can happen at any point theoretically like here, right? Or...

> +		guc_update_pm_timestamp(guc);
> +		intel_gt_pm_put_async(gt);
> +	}
> +
> +	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
> +	if (stats->running) {
> +		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
> +
> +		total += intel_gt_clock_interval_to_ns(gt, clk);
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +
> +	return ns_to_ktime(total);
> +}
> +
> +static void __reset_guc_busyness_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	unsigned long flags;
> +
> +	cancel_delayed_work_sync(&guc->timestamp.work);
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	guc_update_pm_timestamp(guc);
> +	for_each_engine(engine, gt, id) {
> +		guc_update_engine_gt_clks(engine);
> +		engine->stats.guc.prev_total = 0;
> +	}
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void __update_guc_busyness_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> +
> +	guc_update_pm_timestamp(guc);
> +	for_each_engine(engine, gt, id)

... even here when called from guc_timestamp_ping. Both cases would 
"corrupt" the saved state due to the potential to read partially cleared 
data from the shared page?

Looking around the code base it should be possible to use 
intel_gt_reset_trylock and intel_gt_reset_unlock from the worker, but 
from the PMU callback you can't sleep so you'd just need a new helper, 
like a /real/ trylock which just returns error if it fails to lock and 
then you treat it the same way as if you failed to get runtime pm ref. 
Does that make sense?

Regards,

Tvrtko


> +		guc_update_engine_gt_clks(engine);
> +
> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> +}
> +
> +static void guc_timestamp_ping(struct work_struct *wrk)
> +{
> +	struct intel_guc *guc = container_of(wrk, typeof(*guc),
> +					     timestamp.work.work);
> +	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +
> +	if (uc->reset_in_progress)
> +		return;
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
> +		__update_guc_busyness_stats(guc);
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +}
> +
> +static int guc_action_enable_usage_stats(struct intel_guc *guc)
> +{
> +	u32 offset = intel_guc_engine_usage_offset(guc);
> +	u32 action[] = {
> +		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
> +		offset,
> +		0,
> +	};
> +
> +	return intel_guc_send(guc, action, ARRAY_SIZE(action));
> +}
> +
> +static void guc_init_engine_stats(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +	intel_wakeref_t wakeref;
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +
> +	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
> +		int ret = guc_action_enable_usage_stats(guc);
> +
> +		if (ret)
> +			drm_err(&gt->i915->drm,
> +				"Failed to enable usage stats: %d!\n", ret);
> +	}
> +}
> +
> +void intel_guc_busyness_park(struct intel_gt *gt)
> +{
> +	struct intel_guc *guc = &gt->uc.guc;
> +
> +	cancel_delayed_work(&guc->timestamp.work);
> +	__update_guc_busyness_stats(guc);
> +}
> +
> +void intel_guc_busyness_unpark(struct intel_gt *gt)
> +{
> +	struct intel_guc *guc = &gt->uc.guc;
> +
> +	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
> +			 guc->timestamp.ping_delay);
> +}
> +
>   static inline bool
>   submission_disabled(struct intel_guc *guc)
>   {
> @@ -809,6 +1038,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
>   	intel_gt_park_heartbeats(guc_to_gt(guc));
>   	disable_submission(guc);
>   	guc->interrupts.disable(guc);
> +	__reset_guc_busyness_stats(guc);
>   
>   	/* Flush IRQ handler */
>   	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
> @@ -1132,6 +1362,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
>    */
>   int intel_guc_submission_init(struct intel_guc *guc)
>   {
> +	struct intel_gt *gt = guc_to_gt(guc);
>   	int ret;
>   
>   	if (guc->lrc_desc_pool)
> @@ -1152,6 +1383,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
>   	INIT_LIST_HEAD(&guc->guc_id_list);
>   	ida_init(&guc->guc_ids);
>   
> +	spin_lock_init(&guc->timestamp.lock);
> +	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
> +	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
> +
>   	return 0;
>   }
>   
> @@ -2606,7 +2841,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>   		engine->emit_flush = gen12_emit_flush_xcs;
>   	}
>   	engine->set_default_submission = guc_set_default_submission;
> +	engine->busyness = guc_engine_busyness;
>   
> +	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>   	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>   	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
>   
> @@ -2705,6 +2942,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
>   void intel_guc_submission_enable(struct intel_guc *guc)
>   {
>   	guc_init_lrc_mapping(guc);
> +	guc_init_engine_stats(guc);
>   }
>   
>   void intel_guc_submission_disable(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index c7ef44fa0c36..5a95a9f0a8e3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
>   void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
>   				    struct i915_request *hung_rq,
>   				    struct drm_printer *m);
> +void intel_guc_busyness_park(struct intel_gt *gt);
> +void intel_guc_busyness_unpark(struct intel_gt *gt);
>   
>   bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>   
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index a897f4abea0c..9aee08425382 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>   #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
>   #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
>   
> +#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
> +
>   /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
>   #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
>   #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu
  2021-10-07 22:55 [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats Umesh Nerlige Ramappa
@ 2021-10-07 22:55 ` Umesh Nerlige Ramappa
  2021-10-11 11:41   ` Tvrtko Ursulin
  0 siblings, 1 reply; 31+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-10-07 22:55 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: john.c.harrison, Tvrtko Ursulin, daniel.vetter, Matthew Brost

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

	engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically with
a period of 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once every 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them
(i.e. kmd may read the updated total and the stale start). In such a
case, the user may see a higher busyness value followed by smaller ones
which would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  28 +--
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c         |   2 +
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  26 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 238 ++++++++++++++++++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h               |   2 +
 12 files changed, 377 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	intel_engine_print_breadcrumbs(engine, m);
 }
 
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-					    ktime_t *now)
-{
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	ktime_t total = stats->total;
-
-	/*
-	 * If the engine is executing something at the moment
-	 * add it to the total.
-	 */
-	*now = ktime_get();
-	if (READ_ONCE(stats->active))
-		total = ktime_add(total, ktime_sub(*now, stats->start));
-
-	return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine busyness
  * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now)
 {
-	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-	unsigned int seq;
-	ktime_t total;
-
-	do {
-		seq = read_seqcount_begin(&stats->lock);
-		total = __intel_engine_get_busy_time(engine, now);
-	} while (read_seqcount_retry(&stats->lock, seq));
-
-	return total;
+	return engine->busyness(engine, now);
 }
 
 struct intel_context *
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 316d8551d22f..4eb09d07419a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -284,6 +284,28 @@ struct intel_engine_execlists_stats {
 	ktime_t start;
 };
 
+struct intel_engine_guc_stats {
+	/**
+	 * @running: Active state of the engine when busyness was last sampled.
+	 */
+	bool running;
+
+	/**
+	 * @prev_total: Previous value of total runtime clock cycles.
+	 */
+	u32 prev_total;
+
+	/**
+	 * @total_gt_clks: Total gt clock cycles this engine was busy.
+	 */
+	u64 total_gt_clks;
+
+	/**
+	 * @start_gt_clk: GT clock time of last idle to active transition.
+	 */
+	u64 start_gt_clk;
+};
+
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
 	struct intel_gt *gt;
@@ -459,6 +481,12 @@ struct intel_engine_cs {
 	void		(*add_active_request)(struct i915_request *rq);
 	void		(*remove_active_request)(struct i915_request *rq);
 
+	/*
+	 * Get engine busyness and the time at which the busyness was sampled.
+	 */
+	ktime_t		(*busyness)(struct intel_engine_cs *engine,
+				    ktime_t *now);
+
 	struct intel_engine_execlists execlists;
 
 	/*
@@ -508,7 +536,10 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 
 	struct {
-		struct intel_engine_execlists_stats execlists;
+		union {
+			struct intel_engine_execlists_stats execlists;
+			struct intel_engine_guc_stats guc;
+		};
 
 		/**
 		 * @rps: Utilisation at last RPS sampling.
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 7147fe80919e..6bece961eeb1 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3292,6 +3292,38 @@ static void execlists_release(struct intel_engine_cs *engine)
 	lrc_fini_wa_ctx(engine);
 }
 
+static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
+					   ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	ktime_t total = stats->total;
+
+	/*
+	 * If the engine is executing something at the moment
+	 * add it to the total.
+	 */
+	*now = ktime_get();
+	if (READ_ONCE(stats->active))
+		total = ktime_add(total, ktime_sub(*now, stats->start));
+
+	return total;
+}
+
+static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
+					 ktime_t *now)
+{
+	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
+	unsigned int seq;
+	ktime_t total;
+
+	do {
+		seq = read_seqcount_begin(&stats->lock);
+		total = __execlists_engine_busyness(engine, now);
+	} while (read_seqcount_retry(&stats->lock, seq));
+
+	return total;
+}
+
 static void
 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
@@ -3348,6 +3380,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_bb_start = gen8_emit_bb_start;
 	else
 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
+
+	engine->busyness = execlists_engine_busyness;
 }
 
 static void logical_ring_default_irqs(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 524eaf678790..b4a8594bc46c 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -86,6 +86,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
 	intel_rc6_unpark(&gt->rc6);
 	intel_rps_unpark(&gt->rps);
 	i915_pmu_gt_unparked(i915);
+	intel_guc_busyness_unpark(gt);
 
 	intel_gt_unpark_requests(gt);
 	runtime_begin(gt);
@@ -104,6 +105,7 @@ static int __gt_park(struct intel_wakeref *wf)
 	runtime_end(gt);
 	intel_gt_park_requests(gt);
 
+	intel_guc_busyness_park(gt);
 	i915_vma_parked(gt);
 	i915_pmu_gt_parked(i915);
 	intel_rps_park(&gt->rps);
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index 8ff582222aff..ff1311d4beff 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -143,6 +143,7 @@ enum intel_guc_action {
 	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
 	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
 	INTEL_GUC_ACTION_RESET_CLIENT = 0x5507,
+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
 	INTEL_GUC_ACTION_LIMIT
 };
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 5dd174babf7a..22c30dbdf63a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -104,6 +104,8 @@ struct intel_guc {
 	u32 ads_regset_size;
 	/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
 	u32 ads_golden_ctxt_size;
+	/** @ads_engine_usage_size: size of engine usage in the ADS */
+	u32 ads_engine_usage_size;
 
 	/** @lrc_desc_pool: object allocated to hold the GuC LRC descriptor pool */
 	struct i915_vma *lrc_desc_pool;
@@ -138,6 +140,30 @@ struct intel_guc {
 
 	/** @send_mutex: used to serialize the intel_guc_send actions */
 	struct mutex send_mutex;
+
+	struct {
+		/**
+		 * @lock: Lock protecting the below fields and the engine stats.
+		 */
+		spinlock_t lock;
+
+		/**
+		 * @gt_stamp: 64 bit extended value of the GT timestamp.
+		 */
+		u64 gt_stamp;
+
+		/**
+		 * @ping_delay: Period for polling the GT timestamp for
+		 * overflow.
+		 */
+		unsigned long ping_delay;
+
+		/**
+		 * @work: Periodic work to adjust GT timestamp, engine and
+		 * context usage for overflows.
+		 */
+		struct delayed_work work;
+	} timestamp;
 };
 
 static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 2c6ea64af7ec..ca9ab53999d5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -26,6 +26,8 @@
  *      | guc_policies                          |
  *      +---------------------------------------+
  *      | guc_gt_system_info                    |
+ *      +---------------------------------------+
+ *      | guc_engine_usage                      |
  *      +---------------------------------------+ <== static
  *      | guc_mmio_reg[countA] (engine 0.0)     |
  *      | guc_mmio_reg[countB] (engine 0.1)     |
@@ -47,6 +49,7 @@ struct __guc_ads_blob {
 	struct guc_ads ads;
 	struct guc_policies policies;
 	struct guc_gt_system_info system_info;
+	struct guc_engine_usage engine_usage;
 	/* From here on, location is dynamic! Refer to above diagram. */
 	struct guc_mmio_reg regset[0];
 } __packed;
@@ -628,3 +631,21 @@ void intel_guc_ads_reset(struct intel_guc *guc)
 
 	guc_ads_private_data_reset(guc);
 }
+
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
+{
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u32 base = intel_guc_ggtt_offset(guc, guc->ads_vma);
+	u32 offset = base + ptr_offset(blob, engine_usage);
+
+	return offset;
+}
+
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine)
+{
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	struct __guc_ads_blob *blob = guc->ads_blob;
+	u8 guc_class = engine_class_to_guc_class(engine->class);
+
+	return &blob->engine_usage.engines[guc_class][engine->instance];
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
index 3d85051d57e4..e74c110facff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h
@@ -6,8 +6,11 @@
 #ifndef _INTEL_GUC_ADS_H_
 #define _INTEL_GUC_ADS_H_
 
+#include <linux/types.h>
+
 struct intel_guc;
 struct drm_printer;
+struct intel_engine_cs;
 
 int intel_guc_ads_create(struct intel_guc *guc);
 void intel_guc_ads_destroy(struct intel_guc *guc);
@@ -15,5 +18,7 @@ void intel_guc_ads_init_late(struct intel_guc *guc);
 void intel_guc_ads_reset(struct intel_guc *guc);
 void intel_guc_ads_print_policy_info(struct intel_guc *guc,
 				     struct drm_printer *p);
+struct guc_engine_usage_record *intel_guc_engine_usage(struct intel_engine_cs *engine);
+u32 intel_guc_engine_usage_offset(struct intel_guc *guc);
 
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index fa4be13c8854..7c9c081670fc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -294,6 +294,19 @@ struct guc_ads {
 	u32 reserved[15];
 } __packed;
 
+/* Engine usage stats */
+struct guc_engine_usage_record {
+	u32 current_context_index;
+	u32 last_switch_in_stamp;
+	u32 reserved0;
+	u32 total_runtime;
+	u32 reserved1[4];
+} __packed;
+
+struct guc_engine_usage {
+	struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS];
+} __packed;
+
 /* GuC logging structures */
 
 enum guc_log_buffer_type {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index ba0de35f6323..f0c27ae2cecc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -12,6 +12,7 @@
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_clock_utils.h"
 #include "gt/intel_gt_irq.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_gt_requests.h"
@@ -20,6 +21,7 @@
 #include "gt/intel_mocs.h"
 #include "gt/intel_ring.h"
 
+#include "intel_guc_ads.h"
 #include "intel_guc_submission.h"
 
 #include "i915_drv.h"
@@ -750,6 +752,233 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 	xa_unlock_irqrestore(&guc->context_lookup, flags);
 }
 
+/*
+ * GuC stores busyness stats for each engine at context in/out boundaries. A
+ * context 'in' logs execution start time, 'out' adds in -> out delta to total.
+ * i915/kmd accesses 'start', 'total' and 'context id' from memory shared with
+ * GuC.
+ *
+ * __i915_pmu_event_read samples engine busyness. When sampling, if context id
+ * is valid (!= ~0) and start is non-zero, the engine is considered to be
+ * active. For an active engine total busyness = total + (now - start), where
+ * 'now' is the time at which the busyness is sampled. For an inactive engine,
+ * total busyness = total.
+ *
+ * All times are captured from GUCPMTIMESTAMP reg and are in gt clock domain.
+ *
+ * The start and total values provided by GuC are 32 bits and wrap around in a
+ * few minutes. Since perf pmu provides busyness as 64 bit monotonically
+ * increasing ns values, there is a need for this implementation to account for
+ * overflows and extend the GuC provided values to 64 bits before returning
+ * busyness to the user. In order to do that, a worker runs periodically with
+ * a period of 1/8th the time it takes for the timestamp to wrap (i.e. once in
+ * 27 seconds for a gt clock frequency of 19.2 MHz).
+ */
+
+#define WRAP_TIME_CLKS U32_MAX
+#define POLL_TIME_CLKS (WRAP_TIME_CLKS >> 3)
+
+static void
+__extend_last_switch(struct intel_guc *guc, u64 *prev_start, u32 new_start)
+{
+	u32 gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	u32 gt_stamp_last = lower_32_bits(guc->timestamp.gt_stamp);
+
+	if (new_start == lower_32_bits(*prev_start))
+		return;
+
+	if (new_start < gt_stamp_last &&
+	    (new_start - gt_stamp_last) <= POLL_TIME_CLKS)
+		gt_stamp_hi++;
+
+	if (new_start > gt_stamp_last &&
+	    (gt_stamp_last - new_start) <= POLL_TIME_CLKS && gt_stamp_hi)
+		gt_stamp_hi--;
+
+	*prev_start = ((u64)gt_stamp_hi << 32) | new_start;
+}
+
+static void guc_update_engine_gt_clks(struct intel_engine_cs *engine)
+{
+	struct guc_engine_usage_record *rec = intel_guc_engine_usage(engine);
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_guc *guc = &engine->gt->uc.guc;
+	u32 last_switch = rec->last_switch_in_stamp;
+	u32 ctx_id = rec->current_context_index;
+	u32 total = rec->total_runtime;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	stats->running = ctx_id != ~0U && last_switch;
+	if (stats->running)
+		__extend_last_switch(guc, &stats->start_gt_clk, last_switch);
+
+	/*
+	 * Instead of adjusting the total for overflow, just add the
+	 * difference from the previous sample to stats->total_gt_clks.
+	 */
+	if (total && total != ~0U) {
+		stats->total_gt_clks += (u32)(total - stats->prev_total);
+		stats->prev_total = total;
+	}
+}
+
+static void guc_update_pm_timestamp(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	u32 gt_stamp_now, gt_stamp_hi;
+
+	lockdep_assert_held(&guc->timestamp.lock);
+
+	gt_stamp_hi = upper_32_bits(guc->timestamp.gt_stamp);
+	gt_stamp_now = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+
+	if (gt_stamp_now < lower_32_bits(guc->timestamp.gt_stamp))
+		gt_stamp_hi++;
+
+	guc->timestamp.gt_stamp = ((u64) gt_stamp_hi << 32) | gt_stamp_now;
+}
+
+/*
+ * Unlike the execlist mode of submission, total and active times are in terms
+ * of gt clocks. The *now parameter is retained to return the cpu time at which
+ * the busyness was sampled.
+ */
+static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now)
+{
+	struct intel_engine_guc_stats *stats = &engine->stats.guc;
+	struct intel_gt *gt = engine->gt;
+	struct intel_guc *guc = &gt->uc.guc;
+	unsigned long flags;
+	u64 total;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	*now = ktime_get();
+
+	/*
+	 * The active busyness depends on start_gt_clk and gt_stamp.
+	 * gt_stamp is updated by i915 only when gt is awake and the
+	 * start_gt_clk is derived from GuC state. To get a consistent
+	 * view of activity, we query the GuC state only if gt is awake.
+	 */
+	if (intel_gt_pm_get_if_awake(gt)) {
+		guc_update_engine_gt_clks(engine);
+		guc_update_pm_timestamp(guc);
+		intel_gt_pm_put_async(gt);
+	}
+
+	total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
+	if (stats->running) {
+		u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk;
+
+		total += intel_gt_clock_interval_to_ns(gt, clk);
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+
+	return ns_to_ktime(total);
+}
+
+static void __reset_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long flags;
+
+	cancel_delayed_work_sync(&guc->timestamp.work);
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id) {
+		guc_update_engine_gt_clks(engine);
+		engine->stats.guc.prev_total = 0;
+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void __update_guc_busyness_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long flags;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	guc_update_pm_timestamp(guc);
+	for_each_engine(engine, gt, id)
+		guc_update_engine_gt_clks(engine);
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void guc_timestamp_ping(struct work_struct *wrk)
+{
+	struct intel_guc *guc = container_of(wrk, typeof(*guc),
+					     timestamp.work.work);
+	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+
+	if (uc->reset_in_progress)
+		return;
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
+		__update_guc_busyness_stats(guc);
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
+static int guc_action_enable_usage_stats(struct intel_guc *guc)
+{
+	u32 offset = intel_guc_engine_usage_offset(guc);
+	u32 action[] = {
+		INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF,
+		offset,
+		0,
+	};
+
+	return intel_guc_send(guc, action, ARRAY_SIZE(action));
+}
+
+static void guc_init_engine_stats(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
+		int ret = guc_action_enable_usage_stats(guc);
+
+		if (ret)
+			drm_err(&gt->i915->drm,
+				"Failed to enable usage stats: %d!\n", ret);
+	}
+}
+
+void intel_guc_busyness_park(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+
+	cancel_delayed_work(&guc->timestamp.work);
+	__update_guc_busyness_stats(guc);
+}
+
+void intel_guc_busyness_unpark(struct intel_gt *gt)
+{
+	struct intel_guc *guc = &gt->uc.guc;
+
+	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
+			 guc->timestamp.ping_delay);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -809,6 +1038,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
 	intel_gt_park_heartbeats(guc_to_gt(guc));
 	disable_submission(guc);
 	guc->interrupts.disable(guc);
+	__reset_guc_busyness_stats(guc);
 
 	/* Flush IRQ handler */
 	spin_lock_irq(&guc_to_gt(guc)->irq_lock);
@@ -1132,6 +1362,7 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc)
  */
 int intel_guc_submission_init(struct intel_guc *guc)
 {
+	struct intel_gt *gt = guc_to_gt(guc);
 	int ret;
 
 	if (guc->lrc_desc_pool)
@@ -1152,6 +1383,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	INIT_LIST_HEAD(&guc->guc_id_list);
 	ida_init(&guc->guc_ids);
 
+	spin_lock_init(&guc->timestamp.lock);
+	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
+	guc->timestamp.ping_delay = (POLL_TIME_CLKS / gt->clock_frequency + 1) * HZ;
+
 	return 0;
 }
 
@@ -2606,7 +2841,9 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_flush = gen12_emit_flush_xcs;
 	}
 	engine->set_default_submission = guc_set_default_submission;
+	engine->busyness = guc_engine_busyness;
 
+	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
 	engine->flags |= I915_ENGINE_HAS_PREEMPTION;
 	engine->flags |= I915_ENGINE_HAS_TIMESLICES;
 
@@ -2705,6 +2942,7 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
 void intel_guc_submission_enable(struct intel_guc *guc)
 {
 	guc_init_lrc_mapping(guc);
+	guc_init_engine_stats(guc);
 }
 
 void intel_guc_submission_disable(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c7ef44fa0c36..5a95a9f0a8e3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -28,6 +28,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
 void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
 				    struct i915_request *hung_rq,
 				    struct drm_printer *m);
+void intel_guc_busyness_park(struct intel_gt *gt);
+void intel_guc_busyness_unpark(struct intel_gt *gt);
 
 bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a897f4abea0c..9aee08425382 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2664,6 +2664,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   RING_WAIT		(1 << 11) /* gen3+, PRBx_CTL */
 #define   RING_WAIT_SEMAPHORE	(1 << 10) /* gen6+ */
 
+#define GUCPMTIMESTAMP          _MMIO(0xC3E8)
+
 /* There are 16 64-bit CS General Purpose Registers per-engine on Gen8+ */
 #define GEN8_RING_CS_GPR(base, n)	_MMIO((base) + 0x600 + (n) * 8)
 #define GEN8_RING_CS_GPR_UDW(base, n)	_MMIO((base) + 0x600 + (n) * 8 + 4)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2022-10-22  0:21 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-15 23:47 [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats Umesh Nerlige Ramappa
2021-10-15 23:47 ` [Intel-gfx] " Umesh Nerlige Ramappa
2021-10-15 23:47 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
2021-10-15 23:47   ` [Intel-gfx] " Umesh Nerlige Ramappa
2021-10-18  7:58   ` Tvrtko Ursulin
2021-10-18  7:58     ` [Intel-gfx] " Tvrtko Ursulin
2021-10-18 18:35     ` Umesh Nerlige Ramappa
2021-10-18 18:35       ` [Intel-gfx] " Umesh Nerlige Ramappa
2021-10-18 20:35       ` Umesh Nerlige Ramappa
2021-10-18 20:35         ` [Intel-gfx] " Umesh Nerlige Ramappa
2021-10-19  8:32       ` Tvrtko Ursulin
2021-10-19  8:32         ` [Intel-gfx] " Tvrtko Ursulin
2021-10-20  4:41         ` Umesh Nerlige Ramappa
2021-10-20  4:41           ` [Intel-gfx] " Umesh Nerlige Ramappa
2021-10-16  1:22 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/i915/pmu: Add a name to the execlists stats Patchwork
2021-10-16  1:24 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
2021-10-16  2:06 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2021-10-27  0:48 [PATCH 1/2] " Umesh Nerlige Ramappa
2021-10-27  0:48 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
2021-10-27 20:02   ` Matthew Brost
2022-10-21  8:42   ` Tvrtko Ursulin
2022-10-22  0:21     ` Umesh Nerlige Ramappa
2021-10-15  1:18 [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats Umesh Nerlige Ramappa
2021-10-15  1:18 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
2021-10-13  0:56 [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats Umesh Nerlige Ramappa
2021-10-13  0:56 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
2021-10-13 16:06   ` Tvrtko Ursulin
2021-10-13 16:27     ` Umesh Nerlige Ramappa
2021-10-14  8:21   ` Tvrtko Ursulin
2021-10-15  1:01     ` Umesh Nerlige Ramappa
2021-10-07 22:55 [PATCH 1/2] drm/i915/pmu: Add a name to the execlists stats Umesh Nerlige Ramappa
2021-10-07 22:55 ` [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu Umesh Nerlige Ramappa
2021-10-11 11:41   ` Tvrtko Ursulin
2021-10-11 20:08     ` Umesh Nerlige Ramappa
2021-10-12  8:26       ` Tvrtko Ursulin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.