All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oded Gabbay <ogabbay@kernel.org>
To: dri-devel@lists.freedesktop.org
Cc: Tomer Tayar <ttayar@habana.ai>
Subject: [PATCH 10/12] accel/habanalabs: print qman data on error only for lower qman
Date: Tue, 16 May 2023 12:30:28 +0300	[thread overview]
Message-ID: <20230516093030.1220526-10-ogabbay@kernel.org> (raw)
In-Reply-To: <20230516093030.1220526-1-ogabbay@kernel.org>

From: Tomer Tayar <ttayar@habana.ai>

By default, the upper QMANs are not used; instead, the engines' ARCs
access the lower QMANs directly.
Errors for upper QMANs are therefore not expected, and the debug print
of the PQ entries is not needed.

Modify the QMAN debug data print on errors to include only information
for the lower QMAN.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 146 +++---------------
 drivers/accel/habanalabs/gaudi2/gaudi2P.h     |   2 +-
 .../include/gaudi2/asic_reg/gaudi2_regs.h     |  11 ++
 3 files changed, 31 insertions(+), 128 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 6e2561ead546..4981b8eb0ff5 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7744,137 +7744,28 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 	return !!ecc_data->is_critical;
 }
 
-/*
- * gaudi2_queue_idx_dec - decrement queue index (pi/ci) and handle wrap
- *
- * @idx: the current pi/ci value
- * @q_len: the queue length (power of 2)
- *
- * @return the cyclically decremented index
- */
-static inline u32 gaudi2_queue_idx_dec(u32 idx, u32 q_len)
-{
-	u32 mask = q_len - 1;
-
-	/*
-	 * modular decrement is equivalent to adding (queue_size -1)
-	 * later we take LSBs to make sure the value is in the
-	 * range [0, queue_len - 1]
-	 */
-	return (idx + q_len - 1) & mask;
-}
-
-/**
- * gaudi2_print_sw_config_stream_data - print SW config stream data
- *
- * @hdev: pointer to the habanalabs device structure
- * @stream: the QMAN's stream
- * @qman_base: base address of QMAN registers block
- */
-static void gaudi2_print_sw_config_stream_data(struct hl_device *hdev,
-						u32 stream, u64 qman_base)
+static void print_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base)
 {
-	u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr;
-	u32 cq_ptr_lo_off, size;
+	u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
+	u64 cq_ptr, arc_cq_ptr, cp_current_inst;
 
-	cq_ptr_lo_off = mmDCORE0_TPC0_QM_CQ_PTR_LO_1 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0;
-
-	cq_ptr_lo = qman_base + (mmDCORE0_TPC0_QM_CQ_PTR_LO_0 - mmDCORE0_TPC0_QM_BASE) +
-									stream * cq_ptr_lo_off;
-
-	cq_ptr_hi = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_PTR_HI_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0);
-
-	cq_tsize = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_TSIZE_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0);
-
-	cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo);
-	size = RREG32(cq_tsize);
-	dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n",
-		stream, cq_ptr, size);
-}
-
-/**
- * gaudi2_print_last_pqes_on_err - print last PQEs on error
- *
- * @hdev: pointer to the habanalabs device structure
- * @qid_base: first QID of the QMAN (out of 4 streams)
- * @stream: the QMAN's stream
- * @qman_base: base address of QMAN registers block
- * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
- */
-static void gaudi2_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, u32 stream,
-						u64 qman_base, bool pr_sw_conf)
-{
-	u32 ci, qm_ci_stream_off;
-	struct hl_hw_queue *q;
-	u64 pq_ci;
-	int i;
+	lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
+	hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
+	cq_ptr = ((u64) hi) << 32 | lo;
+	cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
 
-	q = &hdev->kernel_queues[qid_base + stream];
-
-	qm_ci_stream_off = mmDCORE0_TPC0_QM_PQ_CI_1 - mmDCORE0_TPC0_QM_PQ_CI_0;
-	pq_ci = qman_base + (mmDCORE0_TPC0_QM_PQ_CI_0 - mmDCORE0_TPC0_QM_BASE) +
-						stream * qm_ci_stream_off;
-
-	hdev->asic_funcs->hw_queues_lock(hdev);
-
-	if (pr_sw_conf)
-		gaudi2_print_sw_config_stream_data(hdev, stream, qman_base);
-
-	ci = RREG32(pq_ci);
-
-	/* we should start printing form ci -1 */
-	ci = gaudi2_queue_idx_dec(ci, HL_QUEUE_LENGTH);
-
-	for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) {
-		struct hl_bd *bd;
-		u64 addr;
-		u32 len;
-
-		bd = q->kernel_address;
-		bd += ci;
-
-		len = le32_to_cpu(bd->len);
-		/* len 0 means uninitialized entry- break */
-		if (!len)
-			break;
-
-		addr = le64_to_cpu(bd->ptr);
-
-		dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n",
-			stream, ci, addr, len);
-
-		/* get previous ci, wrap if needed */
-		ci = gaudi2_queue_idx_dec(ci, HL_QUEUE_LENGTH);
-	}
-
-	hdev->asic_funcs->hw_queues_unlock(hdev);
-}
-
-/**
- * print_qman_data_on_err - extract QMAN data on error
- *
- * @hdev: pointer to the habanalabs device structure
- * @qid_base: first QID of the QMAN (out of 4 streams)
- * @stream: the QMAN's stream
- * @qman_base: base address of QMAN registers block
- *
- * This function attempt to extract as much data as possible on QMAN error.
- * On upper CP print the SW config stream data and last 8 PQEs.
- * On lower CP print SW config data and last PQEs of ALL 4 upper CPs
- */
-static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base, u32 stream, u64 qman_base)
-{
-	u32 i;
-
-	if (stream != QMAN_STREAMS) {
-		gaudi2_print_last_pqes_on_err(hdev, qid_base, stream, qman_base, true);
-		return;
-	}
+	lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
+	hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
+	arc_cq_ptr = ((u64) hi) << 32 | lo;
+	arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
 
-	gaudi2_print_sw_config_stream_data(hdev, stream, qman_base);
+	lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
+	hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
+	cp_current_inst = ((u64) hi) << 32 | lo;
 
-	for (i = 0 ; i < QMAN_STREAMS ; i++)
-		gaudi2_print_last_pqes_on_err(hdev, qid_base, i, qman_base, false);
+	dev_info(hdev->dev,
+		"LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
+		cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst);
 }
 
 static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type,
@@ -7912,7 +7803,8 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
 				error_count++;
 			}
 
-		print_qman_data_on_err(hdev, qid_base, i, qman_base);
+		if (i == QMAN_STREAMS)
+			print_lower_qman_data_on_err(hdev, qman_base);
 	}
 
 	arb_err_val = RREG32(arb_err_addr);
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
index 1cebe707772e..5f3ce086928e 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -98,7 +98,7 @@
 #define GAUDI2_DEFAULT_CARD_NAME		"HL225"
 
 #define QMAN_STREAMS				4
-#define PQ_FETCHER_CACHE_SIZE			8
+
 #define NUM_OF_MME_SBTE_PORTS			5
 #define NUM_OF_MME_WB_PORTS			2
 
diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
index 6c58af614236..a08378d0802b 100644
--- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
+++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
@@ -242,6 +242,17 @@
 #define QM_FENCE2_OFFSET		(mmPDMA0_QM_CP_FENCE2_RDATA_0 - mmPDMA0_QM_BASE)
 #define QM_SEI_STATUS_OFFSET		(mmPDMA0_QM_SEI_STATUS - mmPDMA0_QM_BASE)
 
+#define QM_CQ_PTR_LO_4_OFFSET		(mmPDMA0_QM_CQ_PTR_LO_4 - mmPDMA0_QM_BASE)
+#define QM_CQ_PTR_HI_4_OFFSET		(mmPDMA0_QM_CQ_PTR_HI_4 - mmPDMA0_QM_BASE)
+#define QM_CQ_TSIZE_4_OFFSET		(mmPDMA0_QM_CQ_TSIZE_4 - mmPDMA0_QM_BASE)
+
+#define QM_ARC_CQ_PTR_LO_OFFSET		(mmPDMA0_QM_ARC_CQ_PTR_LO - mmPDMA0_QM_BASE)
+#define QM_ARC_CQ_PTR_HI_OFFSET		(mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE)
+#define QM_ARC_CQ_TSIZE_OFFSET		(mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE)
+
+#define QM_CP_CURRENT_INST_LO_4_OFFSET	(mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE)
+#define QM_CP_CURRENT_INST_HI_4_OFFSET	(mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE)
+
 #define SFT_OFFSET		(mmSFT1_HBW_RTR_IF0_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE)
 #define SFT_IF_RTR_OFFSET	(mmSFT0_HBW_RTR_IF1_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE)
 
-- 
2.40.1


  parent reply	other threads:[~2023-05-16  9:30 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
2023-05-16  9:30 ` [PATCH 02/12] accel/habanalabs: set unused bit as reserved Oded Gabbay
2023-05-17 18:03   ` Ofir Bitton
2023-05-16  9:30 ` [PATCH 03/12] accel/habanalabs: fix mem leak in capture user mappings Oded Gabbay
2023-05-16  9:30 ` [PATCH 04/12] accel/habanalabs: align to latest firmware specs Oded Gabbay
2023-05-17 18:03   ` Ofir Bitton
2023-05-16  9:30 ` [PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck Oded Gabbay
2023-05-17 18:01   ` Ofir Bitton
2023-05-16  9:30 ` [PATCH 06/12] accel/habanalabs: upon DMA errors, use FW-extracted error cause Oded Gabbay
2023-05-16  9:30 ` [PATCH 07/12] accel/habanalabs: remove support for mmu disable Oded Gabbay
2023-05-16  9:30 ` [PATCH 08/12] accel/habanalabs: use binning info when handling razwi Oded Gabbay
2023-05-16  9:30 ` [PATCH 09/12] accel/habanalabs: use lower QM in QM errors handling Oded Gabbay
2023-05-16  9:30 ` Oded Gabbay [this message]
2023-05-16  9:30 ` [PATCH 11/12] accel/habanalabs: update state when loading boot fit Oded Gabbay
2023-05-16  9:30 ` [PATCH 12/12] accel/habanalabs: mask part of hmmu page fault captured address Oded Gabbay

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230516093030.1220526-10-ogabbay@kernel.org \
    --to=ogabbay@kernel.org \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=ttayar@habana.ai \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.