All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alan Previn <alan.previn.teres.alexis@intel.com>
To: intel-gfx@lists.freedesktop.org
Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
Subject: [Intel-gfx] [RFC 6/7] drm/i915/guc: Copy new GuC error capture logs upon G2H notification.
Date: Mon, 22 Nov 2021 15:04:01 -0800	[thread overview]
Message-ID: <20211122230402.2023576-7-alan.previn.teres.alexis@intel.com> (raw)
In-Reply-To: <20211122230402.2023576-1-alan.previn.teres.alexis@intel.com>

Upon the G2H Notify-Err-Capture event, queue a worker to make a
snapshot of the error state capture logs from the GuC-log buffer
(error capture region) into an bigger interim circular buffer store
that can be parsed later during gpu coredump printing.

Also, call that worker function directly for the cases where we
are resetting GuC submission and need to flush outstanding logs.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
---
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   7 +
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 206 ++++++++++++++++++
 .../gpu/drm/i915/gt/uc/intel_guc_capture.h    |  16 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c    |  16 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.h    |   5 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  10 +-
 6 files changed, 256 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index 5af03a486a13..c130f465c19a 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -178,4 +178,11 @@ enum intel_guc_sleep_state_status {
 #define GUC_LOG_CONTROL_VERBOSITY_MASK	(0xF << GUC_LOG_CONTROL_VERBOSITY_SHIFT)
 #define GUC_LOG_CONTROL_DEFAULT_LOGGING	(1 << 8)
 
+enum intel_guc_state_capture_event_status {
+	INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_SUCCESS = 0x0,
+	INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE = 0x1,
+};
+
+#define INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_MASK      0x1
+
 #endif /* _ABI_GUC_ACTIONS_ABI_H */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 0cb358a98605..459fe81c77ae 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -11,8 +11,11 @@
 #include "gt/intel_gt.h"
 #include "gt/intel_lrc_reg.h"
 
+#include <linux/circ_buf.h>
+
 #include "intel_guc_fwif.h"
 #include "intel_guc_capture.h"
+#include "i915_gpu_error.h"
 
 /*
  * Define all device tables of GuC error capture register lists
@@ -390,15 +393,218 @@ int intel_guc_capture_output_min_size_est(struct intel_guc *guc)
 	return (worst_min_size * 3);
 }
 
+/*
+ * KMD Init time flows:
+ * --------------------
+ *     --> alloc A: GuC input capture regs lists (registered via ADS)
+ *                  List acquired via intel_guc_capture_list_count + intel_guc_capture_list_init
+ *                  Size = global-reg-list + (class-reg-list) + (num-instances x instance-reg-list)
+ *                  Device tables carry: 1x global, 1x per-class, 1x per-instance)
+ *                  Caller needs to call per-class and per-instance multiplie times
+ *
+ *     --> alloc B: GuC output capture buf (registered via guc_init_params(log_param))
+ *                  Size = #define CAPTURE_BUFFER_SIZE (warns if on too-small)
+ *                  Note2: 'x 3' to hold multiple capture groups
+ *
+ *     --> alloc C: GuC capture interim circular buffer storage in system mem
+ *                  Size = 'power_of_two(sizeof(B))' as per kernel circular buffer helper
+ *
+ * GUC Runtime notify capture:
+ * --------------------------
+ *     --> G2H STATE_CAPTURE_NOTIFICATION
+ *                   L--> intel_guc_capture_store_snapshot
+ *                        L--> queue(__guc_capture_store_snapshot_work)
+ *                             Copies from B (head->tail) into C
+ */
+
+static void guc_capture_store_insert(struct intel_guc *guc, struct guc_capture_out_store *store,
+				     unsigned char *new_data, size_t bytes)
+{
+	struct drm_i915_private *dev_priv = (guc_to_gt(guc))->i915;
+	unsigned char *dst_data = store->addr;
+	unsigned long h, t;
+	size_t tmp;
+
+	h = store->head;
+	t = store->tail;
+	if (CIRC_SPACE(h, t, store->size) >= bytes) {
+		while (bytes) {
+			tmp = CIRC_SPACE_TO_END(h, t, store->size);
+			if (tmp) {
+				tmp = tmp < bytes ? tmp : bytes;
+				i915_unaligned_memcpy_from_wc(&dst_data[h], new_data, tmp);
+				bytes -= tmp;
+				new_data += tmp;
+				h = (h + tmp) & (store->size - 1);
+			} else {
+				drm_err(&dev_priv->drm, "circbuf copy-to ptr-corruption!\n");
+				break;
+			}
+		}
+		store->head = h;
+	} else {
+		drm_err(&dev_priv->drm, "GuC capture interim-store insufficient space!\n");
+	}
+}
+
+static void __guc_capture_store_snapshot_work(struct intel_guc *guc)
+{
+	struct drm_i915_private *dev_priv = (guc_to_gt(guc))->i915;
+	unsigned int buffer_size, read_offset, write_offset, bytes_to_copy, full_count;
+	struct guc_log_buffer_state *log_buf_state;
+	struct guc_log_buffer_state log_buf_state_local;
+	void *src_data, *dst_data = NULL;
+	bool new_overflow;
+
+	/* Lock to get the pointer to GuC capture-log-buffer-state */
+	mutex_lock(&guc->log_state[GUC_CAPTURE_LOG_BUFFER].lock);
+	log_buf_state = guc->log.buf_addr +
+			(sizeof(struct guc_log_buffer_state) * GUC_CAPTURE_LOG_BUFFER);
+	src_data = guc->log.buf_addr + guc_get_log_buffer_offset(GUC_CAPTURE_LOG_BUFFER);
+
+	/*
+	 * Make a copy of the state structure, inside GuC log buffer
+	 * (which is uncached mapped), on the stack to avoid reading
+	 * from it multiple times.
+	 */
+	memcpy(&log_buf_state_local, log_buf_state, sizeof(struct guc_log_buffer_state));
+	buffer_size = guc_get_log_buffer_size(GUC_CAPTURE_LOG_BUFFER);
+	read_offset = log_buf_state_local.read_ptr;
+	write_offset = log_buf_state_local.sampled_write_ptr;
+	full_count = log_buf_state_local.buffer_full_cnt;
+
+	/* Bookkeeping stuff */
+	guc->log_state[GUC_CAPTURE_LOG_BUFFER].flush += log_buf_state_local.flush_to_file;
+	new_overflow = guc_check_log_buf_overflow(guc, &guc->log_state[GUC_CAPTURE_LOG_BUFFER],
+						  full_count);
+
+	/* Update the state of shared log buffer */
+	log_buf_state->read_ptr = write_offset;
+	log_buf_state->flush_to_file = 0;
+
+	mutex_unlock(&guc->log_state[GUC_CAPTURE_LOG_BUFFER].lock);
+
+	dst_data = guc->capture.out_store.addr;
+	if (dst_data) {
+		mutex_lock(&guc->capture.out_store.lock);
+
+		/* Now copy the actual logs. */
+		if (unlikely(new_overflow)) {
+			/* copy the whole buffer in case of overflow */
+			read_offset = 0;
+			write_offset = buffer_size;
+		} else if (unlikely((read_offset > buffer_size) ||
+					(write_offset > buffer_size))) {
+			drm_err(&dev_priv->drm, "invalid GuC log capture buffer state!\n");
+			/* copy whole buffer as offsets are unreliable */
+			read_offset = 0;
+			write_offset = buffer_size;
+		}
+
+		/* first copy from the tail end of the GuC log capture buffer */
+		if (read_offset > write_offset) {
+			guc_capture_store_insert(guc, &guc->capture.out_store, src_data,
+						 write_offset);
+			bytes_to_copy = buffer_size - read_offset;
+		} else {
+			bytes_to_copy = write_offset - read_offset;
+		}
+		guc_capture_store_insert(guc, &guc->capture.out_store, src_data + read_offset,
+					 bytes_to_copy);
+
+		mutex_unlock(&guc->capture.out_store.lock);
+	}
+}
+
+static void guc_capture_store_snapshot_work(struct work_struct *work)
+{
+	struct intel_guc_state_capture *capture =
+		container_of(work, struct intel_guc_state_capture, store_work);
+	struct intel_guc *guc =
+		container_of(capture, struct intel_guc, capture);
+
+	__guc_capture_store_snapshot_work(guc);
+}
+
+void  intel_guc_capture_store_snapshot(struct intel_guc *guc)
+{
+	if (guc->capture.enabled)
+		queue_work(system_highpri_wq, &guc->capture.store_work);
+}
+
+void intel_guc_capture_store_snapshot_immediate(struct intel_guc *guc)
+{
+	if (guc->capture.enabled)
+		__guc_capture_store_snapshot_work(guc);
+}
+
+static void guc_capture_store_destroy(struct intel_guc *guc)
+{
+	mutex_destroy(&guc->capture.out_store.lock);
+	mutex_destroy(&guc->capture.out_store.lock);
+	guc->capture.out_store.size = 0;
+	kfree(guc->capture.out_store.addr);
+	guc->capture.out_store.addr = NULL;
+}
+
+static int guc_capture_store_create(struct intel_guc *guc)
+{
+	/*
+	 * Make this interim buffer 3x the GuC capture output buffer so that we can absorb
+	 * a little delay when processing the raw capture dumps into text friendly logs
+	 * for the i915_gpu_coredump output
+	 */
+	size_t max_dump_size;
+	struct drm_i915_private *dev_priv = (guc_to_gt(guc))->i915;
+
+	GEM_BUG_ON(guc->capture.out_store.addr);
+
+	max_dump_size = PAGE_ALIGN(intel_guc_capture_output_min_size_est(guc));
+	max_dump_size = roundup_pow_of_two(max_dump_size);
+
+	guc->capture.out_store.addr = kzalloc(max_dump_size, GFP_KERNEL);
+	if (!guc->capture.out_store.addr) {
+		drm_warn(&dev_priv->drm, "GuC-capture interim-store populated at init!\n");
+		return -ENOMEM;
+	}
+	guc->capture.out_store.size = max_dump_size;
+	mutex_init(&guc->capture.out_store.lock);
+	mutex_init(&guc->capture.out_store.lock);
+
+	return 0;
+}
+
 void intel_guc_capture_destroy(struct intel_guc *guc)
 {
+	if (!guc->capture.enabled)
+		return;
+
+	guc->capture.enabled = false;
+
+	intel_synchronize_irq(guc_to_gt(guc)->i915);
+	flush_work(&guc->capture.store_work);
+	guc_capture_store_destroy(guc);
 	guc_capture_clear_ext_regs(guc->capture.reglists);
 }
 
 int intel_guc_capture_init(struct intel_guc *guc)
 {
 	struct drm_i915_private *dev_priv = (guc_to_gt(guc))->i915;
+	int ret;
 
 	guc->capture.reglists = guc_capture_get_device_reglist(dev_priv);
+	/*
+	 * allocate interim store at init time so we dont require memory
+	 * allocation whilst in the midst of the reset + capture
+	 */
+	ret = guc_capture_store_create(guc);
+	if (ret) {
+		guc_capture_clear_ext_regs(guc->capture.reglists);
+		return ret;
+	}
+
+	INIT_WORK(&guc->capture.store_work, guc_capture_store_snapshot_work);
+	guc->capture.enabled = true;
+
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
index 839b53425e1e..7031de12f3a1 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
@@ -64,7 +64,19 @@ struct intel_guc_capture_out_group {
 	struct intel_guc_capture_out_data group_lists[0];
 };
 
+struct guc_capture_out_store {
+	/* An interim storage to copy the GuC error-capture-output before
+	 * parsing and reporting via proper reporting flows with formatting.
+	 */
+	unsigned char *addr;
+	size_t size;
+	unsigned long head; /* inject new output capture data */
+	unsigned long tail; /* remove output capture data when reporting */
+	struct mutex lock; /*lock head or tail when copying capture in or extracting out*/
+};
+
 struct intel_guc_state_capture {
+	bool enabled;
 	struct __guc_mmio_reg_descr_group *reglists;
 	u16 num_instance_regs[GUC_CAPTURE_LIST_INDEX_MAX][GUC_MAX_ENGINE_CLASSES];
 	u16 num_class_regs[GUC_CAPTURE_LIST_INDEX_MAX][GUC_MAX_ENGINE_CLASSES];
@@ -72,14 +84,18 @@ struct intel_guc_state_capture {
 	int instance_list_size;
 	int class_list_size;
 	int global_list_size;
+	struct guc_capture_out_store out_store;
+	struct work_struct store_work;
 };
 
+void intel_guc_capture_store_snapshot(struct intel_guc *guc);
 int intel_guc_capture_list_count(struct intel_guc *guc, u32 owner, u32 type, u32 class,
 				 u16 *num_entries);
 int intel_guc_capture_list_init(struct intel_guc *guc, u32 owner, u32 type, u32 class,
 				struct guc_mmio_reg *ptr, u16 num_entries);
 int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
 void intel_guc_capture_destroy(struct intel_guc *guc);
+void intel_guc_capture_store_snapshot_immediate(struct intel_guc *guc);
 int intel_guc_capture_init(struct intel_guc *guc);
 
 #endif /* _INTEL_GUC_CAPTURE_H */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index dd86530f77a1..1354dbde9994 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -194,7 +194,7 @@ bool guc_check_log_buf_overflow(struct intel_guc *guc,
 	return overflow;
 }
 
-static unsigned int guc_get_log_buffer_size(enum guc_log_buffer_type type)
+unsigned int guc_get_log_buffer_size(enum guc_log_buffer_type type)
 {
 	switch (type) {
 	case GUC_DEBUG_LOG_BUFFER:
@@ -210,6 +210,20 @@ static unsigned int guc_get_log_buffer_size(enum guc_log_buffer_type type)
 	return 0;
 }
 
+size_t guc_get_log_buffer_offset(enum guc_log_buffer_type type)
+{
+	enum guc_log_buffer_type i;
+	size_t offset = PAGE_SIZE;/* for the log_buffer_states */
+
+	for (i = GUC_DEBUG_LOG_BUFFER; i < GUC_MAX_LOG_BUFFER; i++) {
+		if (i == type)
+			break;
+		offset += guc_get_log_buffer_size(i);
+	}
+
+	return offset;
+}
+
 static void _guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log)
 {
 	struct intel_guc *guc = log_to_guc(log);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
index 2968023f7447..9bf29343df0e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
@@ -64,8 +64,13 @@ struct intel_guc_log {
 };
 
 void intel_guc_log_init_early(struct intel_guc_log *log);
+unsigned int guc_get_log_buffer_size(enum guc_log_buffer_type type);
+size_t guc_get_log_buffer_offset(enum guc_log_buffer_type type);
 int intel_guc_log_create(struct intel_guc_log *log);
 void intel_guc_log_destroy(struct intel_guc_log *log);
+ 
+bool guc_check_log_buf_overflow(struct intel_guc *guc, struct intel_guc_log_stats *state,
+				unsigned int full_cnt);
 
 int intel_guc_log_set_level(struct intel_guc_log *log, u32 level);
 bool intel_guc_log_relay_created(const struct intel_guc_log *log);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 0bfc92b1b982..0afd9ddd71fc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -24,6 +24,7 @@
 
 #include "intel_guc_ads.h"
 #include "intel_guc_submission.h"
+#include "gt/uc/intel_guc_capture.h"
 
 #include "i915_drv.h"
 #include "i915_trace.h"
@@ -1431,6 +1432,8 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
 	}
 
 	scrub_guc_desc_for_outstanding_g2h(guc);
+
+	intel_guc_capture_store_snapshot_immediate(guc);
 }
 
 static struct intel_engine_cs *
@@ -4013,10 +4016,11 @@ int intel_guc_error_capture_process_msg(struct intel_guc *guc,
 		return -EPROTO;
 	}
 
-	status = msg[0];
-	drm_info(&guc_to_gt(guc)->i915->drm, "Got error capture: status = %d", status);
+	status = msg[0] & INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_MASK;
+	if (status == INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE)
+		drm_warn(&guc_to_gt(guc)->i915->drm, "G2H-Error capture no space\n");
 
-	/* Add extraction of error capture dump */
+	intel_guc_capture_store_snapshot(guc);
 
 	return 0;
 }
-- 
2.25.1


  parent reply	other threads:[~2021-11-22 23:03 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-22 23:03 [RFC 0/7] Add GuC Error Capture Support Alan Previn
2021-11-22 23:03 ` [Intel-gfx] " Alan Previn
2021-11-22 23:03 ` [Intel-gfx] [RFC 1/7] drm/i915/guc: Add basic support for error capture lists Alan Previn
2021-11-23 21:12   ` Michal Wajdeczko
2021-12-08 18:23     ` Teres Alexis, Alan Previn
2021-11-22 23:03 ` [Intel-gfx] [RFC 2/7] drm/i915/guc: Update GuC ADS size " Alan Previn
2021-11-23 21:46   ` Michal Wajdeczko
2021-11-24  9:52     ` Jani Nikula
2021-11-24 17:34     ` Teres Alexis, Alan Previn
2021-12-21 23:15       ` Teres Alexis, Alan Previn
2021-12-22  1:49       ` Teres Alexis, Alan Previn
2021-12-22 20:13     ` Teres Alexis, Alan Previn
2021-11-24 10:06   ` Jani Nikula
2021-11-24 17:37     ` Teres Alexis, Alan Previn
2021-11-22 23:03 ` [Intel-gfx] [RFC 3/7] drm/i915/guc: Populate XE_LP register lists for GuC error state capture Alan Previn
2021-11-23  1:59   ` kernel test robot
2021-11-23 21:55   ` Michal Wajdeczko
2021-11-24 17:16     ` Teres Alexis, Alan Previn
2021-11-22 23:03 ` [Intel-gfx] [RFC 4/7] drm/i915/guc: Add GuC's error state capture output structures Alan Previn
2021-11-24 10:08   ` Jani Nikula
2021-11-24 17:37     ` Teres Alexis, Alan Previn
2021-12-07 21:01   ` Matthew Brost
2021-12-07 23:35     ` Teres Alexis, Alan Previn
2021-11-22 23:04 ` [Intel-gfx] [RFC 5/7] drm/i915/guc: Update GuC's log-buffer-state access for error capture Alan Previn
2021-12-07 22:31   ` Matthew Brost
2021-12-07 23:33     ` Teres Alexis, Alan Previn
2021-12-07 23:30       ` Matthew Brost
2021-11-22 23:04 ` Alan Previn [this message]
2021-12-07 22:58   ` [Intel-gfx] [RFC 6/7] drm/i915/guc: Copy new GuC error capture logs upon G2H notification Matthew Brost
2021-12-08  5:14     ` Teres Alexis, Alan Previn
2021-12-08 18:22       ` Teres Alexis, Alan Previn
2021-11-22 23:04 ` [Intel-gfx] [RFC 7/7] drm/i915/guc: Print the GuC error capture output register list Alan Previn
2021-11-23  0:25   ` Teres Alexis, Alan Previn
2021-12-08  0:22   ` Matthew Brost
2021-12-08  6:31     ` Teres Alexis, Alan Previn
2021-12-23 18:54       ` Teres Alexis, Alan Previn
2021-12-24 12:09         ` Tvrtko Ursulin
2021-12-24 13:34           ` Teres Alexis, Alan Previn
2022-01-04 13:56             ` Tvrtko Ursulin
2022-01-05 17:30               ` Teres Alexis, Alan Previn
2022-01-06  9:38                 ` Tvrtko Ursulin
2022-01-06 18:33                   ` Teres Alexis, Alan Previn
2022-01-07  9:03                     ` Tvrtko Ursulin
2022-01-07 17:03                       ` Teres Alexis, Alan Previn
2022-01-10  8:07                         ` Tvrtko Ursulin
2022-01-10 18:19                           ` Teres Alexis, Alan Previn
2022-01-11 10:08                             ` Tvrtko Ursulin
2022-01-14  7:16                               ` Teres Alexis, Alan Previn
2021-11-22 23:44 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Add GuC Error Capture Support Patchwork
2021-11-22 23:45 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
2021-11-23  0:16 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
2021-11-23  0:40 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for Add GuC Error Capture Support (rev2) Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211122230402.2023576-7-alan.previn.teres.alexis@intel.com \
    --to=alan.previn.teres.alexis@intel.com \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.