All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot
@ 2024-01-30 22:37 Rodrigo Vivi
  2024-01-30 22:37 ` [PATCH 2/2] drm/xe: Make a standalone snapshot to that survives unbind Rodrigo Vivi
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Rodrigo Vivi @ 2024-01-30 22:37 UTC (permalink / raw)
  To: intel-xe; +Cc: Rodrigo Vivi

devcoredump direction is that the snapshot could be taken
even after the driver unbind. At that point the xe device
will be gone. So, let's convert to a proper snapshot.

Cc: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_devcoredump.c       |  5 +-
 drivers/gpu/drm/xe/xe_devcoredump_types.h |  3 +-
 drivers/gpu/drm/xe/xe_device.c            | 80 ++++++++++++++++++++---
 drivers/gpu/drm/xe/xe_device.h            |  6 +-
 drivers/gpu/drm/xe/xe_device_types.h      | 13 ++++
 5 files changed, 94 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index e701f0d07b67..30e7edbb8b6f 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -63,7 +63,6 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 				   size_t count, void *data, size_t datalen)
 {
 	struct xe_devcoredump *coredump = data;
-	struct xe_device *xe = coredump_to_xe(coredump);
 	struct xe_devcoredump_snapshot *ss;
 	struct drm_printer p;
 	struct drm_print_iterator iter;
@@ -90,7 +89,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
 	ts = ktime_to_timespec64(ss->boot_time);
 	drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
-	xe_device_snapshot_print(xe, &p);
+	xe_device_snapshot_print(ss->xe, &p);
 
 	drm_printf(&p, "\n**** GuC CT ****\n");
 	xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p);
@@ -114,6 +113,7 @@ static void xe_devcoredump_free(void *data)
 	if (!data || !coredump_to_xe(coredump))
 		return;
 
+	xe_device_snapshot_free(coredump->snapshot.xe);
 	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
 	xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
 	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
@@ -153,6 +153,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
 
 	xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
 
+	coredump->snapshot.xe = xe_device_snapshot_capture(gt_to_xe(q->gt));
 	coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
 	coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(job);
 
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index 50106efcbc29..8950b1ca7456 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -26,7 +26,8 @@ struct xe_devcoredump_snapshot {
 	/** @boot_time:  Relative boot time so the uptime can be calculated. */
 	ktime_t boot_time;
 
-	/* GuC snapshots */
+	/** @xe: Xe Device Info snapshot */
+	struct xe_device_snapshot *xe;
 	/** @ct: GuC CT snapshot */
 	struct xe_guc_ct_snapshot *ct;
 	/** @ge: Guc Engine snapshot */
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 6faa7865b1aa..6999bb7aea42 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -728,22 +728,84 @@ void xe_device_mem_access_put(struct xe_device *xe)
 	xe_assert(xe, ref >= 0);
 }
 
-void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p)
+/**
+ * xe_device_snapshot_capture - Take a quick snapshot of the Xe Device info.
+ * @xe: faulty Xe device.
+ *
+ * This can be printed out in a later stage like during dev_coredump
+ * analysis.
+ *
+ * Returns: a Xe device snapshot that must be freed by the
+ * caller, using `xe_device_snapshot_free`.
+ */
+struct xe_device_snapshot *
+xe_device_snapshot_capture(struct xe_device *xe)
 {
+	struct xe_device_snapshot *snapshot;
 	struct xe_gt *gt;
 	u8 id;
 
-	drm_printf(p, "PCI ID: 0x%04x\n", xe->info.devid);
-	drm_printf(p, "PCI revision: 0x%02x\n", xe->info.revid);
+	snapshot = kzalloc(sizeof(*snapshot), GFP_ATOMIC);
+
+	if (!snapshot) {
+		drm_err(&xe->drm, "Skipping Xe Device snapshot.\n");
+		return NULL;
+	}
 
+	snapshot->devid = xe->info.devid;
+	snapshot->revid = xe->info.revid;
+	snapshot->gt_count = xe->info.gt_count;
+
+	snapshot->gt = kmalloc_array(xe->info.gt_count,
+				    sizeof(struct gt_info_snapshot), GFP_ATOMIC);
 	for_each_gt(gt, xe, id) {
-		drm_printf(p, "GT id: %u\n", id);
+		snapshot->gt[id].type = gt->info.type;
+		snapshot->gt[id].gmdid = gt->info.gmdid;
+		snapshot->gt[id].reference_clock = gt->info.reference_clock;
+	}
+
+	return snapshot;
+}
+
+/**
+ * xe_decice_snapshot_print - Print out a given Xe device snapshot.
+ * @snapshot: Xe device snapshot object.
+ * @p: drm_printer where it will be printed out.
+ *
+ * This function prints out a given GuC Submit Engine snapshot object.
+ */
+void xe_device_snapshot_print(struct xe_device_snapshot *ss,
+			      struct drm_printer *p)
+{
+	int i;
+
+	drm_printf(p, "PCI ID: 0x%04x\n", ss->devid);
+	drm_printf(p, "PCI revision: 0x%02x\n", ss->revid);
+
+	for(i = 0; i < ss->gt_count; i++) {
+		drm_printf(p, "GT id: %u\n", i);
 		drm_printf(p, "\tType: %s\n",
-			   gt->info.type == XE_GT_TYPE_MAIN ? "main" : "media");
+			   ss->gt[i].type == XE_GT_TYPE_MAIN ? "main" : "media");
 		drm_printf(p, "\tIP ver: %u.%u.%u\n",
-			   REG_FIELD_GET(GMD_ID_ARCH_MASK, gt->info.gmdid),
-			   REG_FIELD_GET(GMD_ID_RELEASE_MASK, gt->info.gmdid),
-			   REG_FIELD_GET(GMD_ID_REVID, gt->info.gmdid));
-		drm_printf(p, "\tCS reference clock: %u\n", gt->info.reference_clock);
+			   REG_FIELD_GET(GMD_ID_ARCH_MASK, ss->gt[i].gmdid),
+			   REG_FIELD_GET(GMD_ID_RELEASE_MASK, ss->gt[i].gmdid),
+			   REG_FIELD_GET(GMD_ID_REVID, ss->gt[i].gmdid));
+		drm_printf(p, "\tCS reference clock: %u\n", ss->gt[i].reference_clock);
 	}
 }
+
+/**
+ * xe_device_snapshot_free - Free all allocated objects for a given xe device snapshot.
+ * @snapshot: Xe device snapshot object.
+ *
+ * This function free all the memory that needed to be allocated at capture
+ * time.
+ */
+void xe_device_snapshot_free(struct xe_device_snapshot *snapshot)
+{
+	if (!snapshot)
+		return;
+
+	kfree(snapshot->gt);
+	kfree(snapshot);
+}
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index 270124da1e00..409bfbcffb4a 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -175,6 +175,10 @@ static inline bool xe_device_has_memirq(struct xe_device *xe)
 
 u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size);
 
-void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
+struct xe_device_snapshot *
+xe_device_snapshot_capture(struct xe_device *xe);
+void xe_device_snapshot_print(struct xe_device_snapshot *snapshot,
+			      struct drm_printer *p);
+void xe_device_snapshot_free(struct xe_device_snapshot *snapshot);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index eb2b806a1d23..50dac1a5b053 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -540,6 +540,19 @@ struct xe_device {
 #endif
 };
 
+struct gt_info_snapshot {
+	enum xe_gt_type type;
+	u32 gmdid;
+	u32 reference_clock;
+};
+
+struct xe_device_snapshot {
+	u16 devid;
+	u8 revid;
+	u8 gt_count;
+	struct gt_info_snapshot *gt;
+};
+
 /**
  * struct xe_file - file handle for XE driver
  */
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] drm/xe: Make a standalone snapshot to that survives unbind
  2024-01-30 22:37 [PATCH 1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot Rodrigo Vivi
@ 2024-01-30 22:37 ` Rodrigo Vivi
  2024-01-31  8:58   ` Jani Nikula
  2024-01-31  5:02 ` ✗ CI.Patch_applied: failure for series starting with [1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot Patchwork
  2024-01-31 13:44 ` [PATCH 1/2] " Souza, Jose
  2 siblings, 1 reply; 5+ messages in thread
From: Rodrigo Vivi @ 2024-01-30 22:37 UTC (permalink / raw)
  To: intel-xe; +Cc: Rodrigo Vivi

Instead of having the coredump embedded to the xe device,
let's dynamically allocate that and remove only when
requested by devcoredump.

This will allow the 'data' to be read even when the xe_device
is already gone at unbind for instance.

Of course, the module cannot be unloaded, but this is
guaranteed by devcoredump holding the xe module reference.
Only after devcoredump device deletion is that this reference
will be put and the xe module can be removed.

Our scripts and IGT helpers neeed to be adjusted to write
something to the data file before rmmod so the driver can
be properly removed or reloaded.

Cc: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_devcoredump.c  | 32 ++++++++++------------------
 drivers/gpu/drm/xe/xe_device_types.h |  2 +-
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 30e7edbb8b6f..64886773b70b 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -49,11 +49,6 @@
 
 #ifdef CONFIG_DEV_COREDUMP
 
-static struct xe_device *coredump_to_xe(const struct xe_devcoredump *coredump)
-{
-	return container_of(coredump, struct xe_device, devcoredump);
-}
-
 static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
 {
 	return &q->gt->uc.guc;
@@ -69,10 +64,6 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	struct timespec64 ts;
 	int i;
 
-	/* Our device is gone already... */
-	if (!data || !coredump_to_xe(coredump))
-		return -ENODEV;
-
 	iter.data = buffer;
 	iter.offset = 0;
 	iter.start = offset;
@@ -109,10 +100,6 @@ static void xe_devcoredump_free(void *data)
 	struct xe_devcoredump *coredump = data;
 	int i;
 
-	/* Our device is gone. Nothing to do... */
-	if (!data || !coredump_to_xe(coredump))
-		return;
-
 	xe_device_snapshot_free(coredump->snapshot.xe);
 	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
 	xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
@@ -121,8 +108,8 @@ static void xe_devcoredump_free(void *data)
 			xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]);
 
 	coredump->captured = false;
-	drm_info(&coredump_to_xe(coredump)->drm,
-		 "Xe device coredump has been deleted.\n");
+	coredump = NULL;
+	kfree(coredump);
 }
 
 static void devcoredump_snapshot(struct xe_devcoredump *coredump,
@@ -181,21 +168,24 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
 void xe_devcoredump(struct xe_sched_job *job)
 {
 	struct xe_device *xe = gt_to_xe(job->q->gt);
-	struct xe_devcoredump *coredump = &xe->devcoredump;
 
-	if (coredump->captured) {
-		drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
+	if (xe->devcoredump && xe->devcoredump->captured) {
+		drm_info(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
 		return;
 	}
 
-	coredump->captured = true;
-	devcoredump_snapshot(coredump, job);
+	xe->devcoredump = kzalloc(sizeof(*xe->devcoredump), GFP_KERNEL);
+	if (!xe->devcoredump)
+		drm_err(&xe->drm, "devcoredump failed\n");
+
+	xe->devcoredump->captured = true;
+	devcoredump_snapshot(xe->devcoredump, job);
 
 	drm_info(&xe->drm, "Xe device coredump has been created\n");
 	drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
 		 xe->drm.primary->index);
 
-	dev_coredumpm(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
+	dev_coredumpm(xe->drm.dev, THIS_MODULE, xe->devcoredump, 0, GFP_KERNEL,
 		      xe_devcoredump_read, xe_devcoredump_free);
 }
 #endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 50dac1a5b053..4372f5cc98b6 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -214,7 +214,7 @@ struct xe_device {
 	struct drm_device drm;
 
 	/** @devcoredump: device coredump */
-	struct xe_devcoredump devcoredump;
+	struct xe_devcoredump *devcoredump;
 
 	/** @info: device info */
 	struct intel_device_info {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* ✗ CI.Patch_applied: failure for series starting with [1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot
  2024-01-30 22:37 [PATCH 1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot Rodrigo Vivi
  2024-01-30 22:37 ` [PATCH 2/2] drm/xe: Make a standalone snapshot to that survives unbind Rodrigo Vivi
@ 2024-01-31  5:02 ` Patchwork
  2024-01-31 13:44 ` [PATCH 1/2] " Souza, Jose
  2 siblings, 0 replies; 5+ messages in thread
From: Patchwork @ 2024-01-31  5:02 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: intel-xe

== Series Details ==

Series: series starting with [1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot
URL   : https://patchwork.freedesktop.org/series/129345/
State : failure

== Summary ==

=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: 3d244fce9 drm-tip: 2024y-01m-30d-19h-55m-23s UTC integration manifest
=== git am output follows ===
error: patch failed: drivers/gpu/drm/xe/xe_devcoredump.c:114
error: drivers/gpu/drm/xe/xe_devcoredump.c: patch does not apply
error: patch failed: drivers/gpu/drm/xe/xe_device.c:728
error: drivers/gpu/drm/xe/xe_device.c: patch does not apply
error: patch failed: drivers/gpu/drm/xe/xe_device.h:175
error: drivers/gpu/drm/xe/xe_device.h: patch does not apply
hint: Use 'git am --show-current-patch' to see the failed patch
Applying: drm/xe: Convert xe_device_snapshot to a standalone snapshot
Patch failed at 0001 drm/xe: Convert xe_device_snapshot to a standalone snapshot
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/2] drm/xe: Make a standalone snapshot to that survives unbind
  2024-01-30 22:37 ` [PATCH 2/2] drm/xe: Make a standalone snapshot to that survives unbind Rodrigo Vivi
@ 2024-01-31  8:58   ` Jani Nikula
  0 siblings, 0 replies; 5+ messages in thread
From: Jani Nikula @ 2024-01-31  8:58 UTC (permalink / raw)
  To: Rodrigo Vivi, intel-xe; +Cc: Lucas De Marchi, Rodrigo Vivi

On Tue, 30 Jan 2024, Rodrigo Vivi <rodrigo.vivi@intel.com> wrote:
> Instead of having the coredump embedded to the xe device,
> let's dynamically allocate that and remove only when
> requested by devcoredump.

[snip]

> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 50dac1a5b053..4372f5cc98b6 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -214,7 +214,7 @@ struct xe_device {
>  	struct drm_device drm;
>  
>  	/** @devcoredump: device coredump */
> -	struct xe_devcoredump devcoredump;
> +	struct xe_devcoredump *devcoredump;

Yes! I *much* prefer having pointer members in struct xe_device over
embedding structs, for anything where pointer chasing is not a
performance issue. Which is most things, really. Ditto in struct
drm_i915_private, but converting now is a lot of churn.

Anyway, with dynamic allocation you also don't need the type definition
here. You can just forward declare, and drop the include. This file here
is included by absolutely everywhere, and by proxy, so is
xe_devcoredump_types.h. Modify xe_devcoredump_types.h, and you need to
rebuild the entire driver, including all of display. It adds up.

For added bonus, move the struct definitions inside xe_devcoredump.c,
and make struct xe_devcoredump an opaque pointer. This is one of the few
things C has to enforce use of interfaces to access data. There's no
reason for anyone outside of xe_devcoredump.c to directly access
xe->devcoredump. And making the pointer opaque guarantees nobody
bypasses the interfaces in a whim. Those things happen, "I just quickly
...", and you end up in a mess.


BR,
Jani.


-- 
Jani Nikula, Intel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot
  2024-01-30 22:37 [PATCH 1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot Rodrigo Vivi
  2024-01-30 22:37 ` [PATCH 2/2] drm/xe: Make a standalone snapshot to that survives unbind Rodrigo Vivi
  2024-01-31  5:02 ` ✗ CI.Patch_applied: failure for series starting with [1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot Patchwork
@ 2024-01-31 13:44 ` Souza, Jose
  2 siblings, 0 replies; 5+ messages in thread
From: Souza, Jose @ 2024-01-31 13:44 UTC (permalink / raw)
  To: intel-xe, Vivi,  Rodrigo

On Tue, 2024-01-30 at 17:37 -0500, Rodrigo Vivi wrote:
> devcoredump direction is that the snapshot could be taken
> even after the driver unbind. At that point the xe device
> will be gone. So, let's convert to a proper snapshot.

Would not be better to add a devcoredump uapi to remove it during Xe unload?

> 
> Cc: José Roberto de Souza <jose.souza@intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c       |  5 +-
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |  3 +-
>  drivers/gpu/drm/xe/xe_device.c            | 80 ++++++++++++++++++++---
>  drivers/gpu/drm/xe/xe_device.h            |  6 +-
>  drivers/gpu/drm/xe/xe_device_types.h      | 13 ++++
>  5 files changed, 94 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index e701f0d07b67..30e7edbb8b6f 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -63,7 +63,6 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  				   size_t count, void *data, size_t datalen)
>  {
>  	struct xe_devcoredump *coredump = data;
> -	struct xe_device *xe = coredump_to_xe(coredump);
>  	struct xe_devcoredump_snapshot *ss;
>  	struct drm_printer p;
>  	struct drm_print_iterator iter;
> @@ -90,7 +89,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  	drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
>  	ts = ktime_to_timespec64(ss->boot_time);
>  	drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
> -	xe_device_snapshot_print(xe, &p);
> +	xe_device_snapshot_print(ss->xe, &p);
>  
>  	drm_printf(&p, "\n**** GuC CT ****\n");
>  	xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p);
> @@ -114,6 +113,7 @@ static void xe_devcoredump_free(void *data)
>  	if (!data || !coredump_to_xe(coredump))
>  		return;
>  
> +	xe_device_snapshot_free(coredump->snapshot.xe);
>  	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
>  	xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
>  	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
> @@ -153,6 +153,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>  
>  	xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
>  
> +	coredump->snapshot.xe = xe_device_snapshot_capture(gt_to_xe(q->gt));
>  	coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
>  	coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(job);
>  
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 50106efcbc29..8950b1ca7456 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -26,7 +26,8 @@ struct xe_devcoredump_snapshot {
>  	/** @boot_time:  Relative boot time so the uptime can be calculated. */
>  	ktime_t boot_time;
>  
> -	/* GuC snapshots */
> +	/** @xe: Xe Device Info snapshot */
> +	struct xe_device_snapshot *xe;
>  	/** @ct: GuC CT snapshot */
>  	struct xe_guc_ct_snapshot *ct;
>  	/** @ge: Guc Engine snapshot */
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 6faa7865b1aa..6999bb7aea42 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -728,22 +728,84 @@ void xe_device_mem_access_put(struct xe_device *xe)
>  	xe_assert(xe, ref >= 0);
>  }
>  
> -void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p)
> +/**
> + * xe_device_snapshot_capture - Take a quick snapshot of the Xe Device info.
> + * @xe: faulty Xe device.
> + *
> + * This can be printed out in a later stage like during dev_coredump
> + * analysis.
> + *
> + * Returns: a Xe device snapshot that must be freed by the
> + * caller, using `xe_device_snapshot_free`.
> + */
> +struct xe_device_snapshot *
> +xe_device_snapshot_capture(struct xe_device *xe)
>  {
> +	struct xe_device_snapshot *snapshot;
>  	struct xe_gt *gt;
>  	u8 id;
>  
> -	drm_printf(p, "PCI ID: 0x%04x\n", xe->info.devid);
> -	drm_printf(p, "PCI revision: 0x%02x\n", xe->info.revid);
> +	snapshot = kzalloc(sizeof(*snapshot), GFP_ATOMIC);
> +
> +	if (!snapshot) {
> +		drm_err(&xe->drm, "Skipping Xe Device snapshot.\n");
> +		return NULL;
> +	}
>  
> +	snapshot->devid = xe->info.devid;
> +	snapshot->revid = xe->info.revid;
> +	snapshot->gt_count = xe->info.gt_count;
> +
> +	snapshot->gt = kmalloc_array(xe->info.gt_count,
> +				    sizeof(struct gt_info_snapshot), GFP_ATOMIC);
>  	for_each_gt(gt, xe, id) {
> -		drm_printf(p, "GT id: %u\n", id);
> +		snapshot->gt[id].type = gt->info.type;
> +		snapshot->gt[id].gmdid = gt->info.gmdid;
> +		snapshot->gt[id].reference_clock = gt->info.reference_clock;
> +	}
> +
> +	return snapshot;
> +}
> +
> +/**
> + * xe_decice_snapshot_print - Print out a given Xe device snapshot.
> + * @snapshot: Xe device snapshot object.
> + * @p: drm_printer where it will be printed out.
> + *
> + * This function prints out a given GuC Submit Engine snapshot object.
> + */
> +void xe_device_snapshot_print(struct xe_device_snapshot *ss,
> +			      struct drm_printer *p)
> +{
> +	int i;
> +
> +	drm_printf(p, "PCI ID: 0x%04x\n", ss->devid);
> +	drm_printf(p, "PCI revision: 0x%02x\n", ss->revid);
> +
> +	for(i = 0; i < ss->gt_count; i++) {
> +		drm_printf(p, "GT id: %u\n", i);
>  		drm_printf(p, "\tType: %s\n",
> -			   gt->info.type == XE_GT_TYPE_MAIN ? "main" : "media");
> +			   ss->gt[i].type == XE_GT_TYPE_MAIN ? "main" : "media");
>  		drm_printf(p, "\tIP ver: %u.%u.%u\n",
> -			   REG_FIELD_GET(GMD_ID_ARCH_MASK, gt->info.gmdid),
> -			   REG_FIELD_GET(GMD_ID_RELEASE_MASK, gt->info.gmdid),
> -			   REG_FIELD_GET(GMD_ID_REVID, gt->info.gmdid));
> -		drm_printf(p, "\tCS reference clock: %u\n", gt->info.reference_clock);
> +			   REG_FIELD_GET(GMD_ID_ARCH_MASK, ss->gt[i].gmdid),
> +			   REG_FIELD_GET(GMD_ID_RELEASE_MASK, ss->gt[i].gmdid),
> +			   REG_FIELD_GET(GMD_ID_REVID, ss->gt[i].gmdid));
> +		drm_printf(p, "\tCS reference clock: %u\n", ss->gt[i].reference_clock);
>  	}
>  }
> +
> +/**
> + * xe_device_snapshot_free - Free all allocated objects for a given xe device snapshot.
> + * @snapshot: Xe device snapshot object.
> + *
> + * This function free all the memory that needed to be allocated at capture
> + * time.
> + */
> +void xe_device_snapshot_free(struct xe_device_snapshot *snapshot)
> +{
> +	if (!snapshot)
> +		return;
> +
> +	kfree(snapshot->gt);
> +	kfree(snapshot);
> +}
> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> index 270124da1e00..409bfbcffb4a 100644
> --- a/drivers/gpu/drm/xe/xe_device.h
> +++ b/drivers/gpu/drm/xe/xe_device.h
> @@ -175,6 +175,10 @@ static inline bool xe_device_has_memirq(struct xe_device *xe)
>  
>  u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size);
>  
> -void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
> +struct xe_device_snapshot *
> +xe_device_snapshot_capture(struct xe_device *xe);
> +void xe_device_snapshot_print(struct xe_device_snapshot *snapshot,
> +			      struct drm_printer *p);
> +void xe_device_snapshot_free(struct xe_device_snapshot *snapshot);
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index eb2b806a1d23..50dac1a5b053 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -540,6 +540,19 @@ struct xe_device {
>  #endif
>  };
>  
> +struct gt_info_snapshot {
> +	enum xe_gt_type type;
> +	u32 gmdid;
> +	u32 reference_clock;
> +};
> +
> +struct xe_device_snapshot {
> +	u16 devid;
> +	u8 revid;
> +	u8 gt_count;
> +	struct gt_info_snapshot *gt;
> +};
> +
>  /**
>   * struct xe_file - file handle for XE driver
>   */


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-01-31 13:44 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-30 22:37 [PATCH 1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot Rodrigo Vivi
2024-01-30 22:37 ` [PATCH 2/2] drm/xe: Make a standalone snapshot to that survives unbind Rodrigo Vivi
2024-01-31  8:58   ` Jani Nikula
2024-01-31  5:02 ` ✗ CI.Patch_applied: failure for series starting with [1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot Patchwork
2024-01-31 13:44 ` [PATCH 1/2] " Souza, Jose

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.