dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] accel/habanalabs: fix access error clear event
@ 2023-03-27 11:22 Oded Gabbay
  2023-03-27 11:22 ` [PATCH 2/3] accel/habanalabs: improvements to FW ver extraction Oded Gabbay
  2023-03-27 11:22 ` [PATCH 3/3] accel/habanalabs: fix HBM MMU interrupt handling Oded Gabbay
  0 siblings, 2 replies; 3+ messages in thread
From: Oded Gabbay @ 2023-03-27 11:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

The register which needs to be cleared is the valid register instead
of the address.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index edcbda3d9b40..bace4ac998e0 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -8884,7 +8884,7 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo
 
 	dev_err_ratelimited(hdev->dev, "%s access error on va 0x%llx\n",
 				is_pmmu ? "PMMU" : "HMMU", addr);
-	WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE), 0);
+	WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0);
 }
 
 static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, u16 event_type,
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/3] accel/habanalabs: improvements to FW ver extraction
  2023-03-27 11:22 [PATCH 1/3] accel/habanalabs: fix access error clear event Oded Gabbay
@ 2023-03-27 11:22 ` Oded Gabbay
  2023-03-27 11:22 ` [PATCH 3/3] accel/habanalabs: fix HBM MMU interrupt handling Oded Gabbay
  1 sibling, 0 replies; 3+ messages in thread
From: Oded Gabbay @ 2023-03-27 11:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Dafna Hirschfeld

From: Dafna Hirschfeld <dhirschfeld@habana.ai>

1. Rename the func to hl_get_preboot_major_minor because we also set
   the extracted values in hdev fields.

2. Free the allocated string in the calling function which makes more
   sense

Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 96027a1c124d..781256dd49ad 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -71,7 +71,7 @@ static char *extract_fw_ver_from_str(const char *fw_str)
 	return NULL;
 }
 
-static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
+static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver)
 {
 	char major[8], minor[8], *first_dot, *second_dot;
 	int rc;
@@ -86,7 +86,7 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
 
 	if (rc) {
 		dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc);
-		goto out;
+		return rc;
 	}
 
 	/* skip the first dot */
@@ -102,9 +102,6 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
 
 	if (rc)
 		dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc);
-
-out:
-	kfree(preboot_ver);
 	return rc;
 }
 
@@ -2181,8 +2178,8 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
 
 			dev_info(hdev->dev, "preboot version %s\n", preboot_ver);
 
-			/* This function takes care of freeing preboot_ver */
-			rc = extract_fw_sub_versions(hdev, preboot_ver);
+			rc = hl_get_preboot_major_minor(hdev, preboot_ver);
+			kfree(preboot_ver);
 			if (rc)
 				return rc;
 		}
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 3/3] accel/habanalabs: fix HBM MMU interrupt handling
  2023-03-27 11:22 [PATCH 1/3] accel/habanalabs: fix access error clear event Oded Gabbay
  2023-03-27 11:22 ` [PATCH 2/3] accel/habanalabs: improvements to FW ver extraction Oded Gabbay
@ 2023-03-27 11:22 ` Oded Gabbay
  1 sibling, 0 replies; 3+ messages in thread
From: Oded Gabbay @ 2023-03-27 11:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Current mapping between HMMU event and HMMU block is wrong.
In addition the captured address in case of a page fault or
an access error is scrambled, Hence we must call the descramble
function.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 145 +++++++++++++++++------
 1 file changed, 108 insertions(+), 37 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index bace4ac998e0..ad491fb2c39d 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2112,6 +2112,7 @@ static bool gaudi2_get_mme_idle_status(struct hl_device *hdev, u64 *mask_arr, u8
 static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
 		struct engines_data *e);
 static u64 gaudi2_mmu_scramble_addr(struct hl_device *hdev, u64 raw_addr);
+static u64 gaudi2_mmu_descramble_addr(struct hl_device *hdev, u64 scrambled_addr);
 
 static void gaudi2_init_scrambler_hbm(struct hl_device *hdev)
 {
@@ -8844,7 +8845,7 @@ static int gaudi2_handle_hif_fatal(struct hl_device *hdev, u16 event_type, u64 i
 static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu,
 					u64 *event_mask)
 {
-	u32 valid, val, axid_l, axid_h;
+	u32 valid, val;
 	u64 addr;
 
 	valid = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID));
@@ -8857,11 +8858,11 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool
 	addr <<= 32;
 	addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE_VA));
 
-	axid_l = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_LSB));
-	axid_h = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_MSB));
+	if (!is_pmmu)
+		addr = gaudi2_mmu_descramble_addr(hdev, addr);
 
-	dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx, transaction id 0x%llX\n",
-				is_pmmu ? "PMMU" : "HMMU", addr, ((u64)axid_h << 32) + axid_l);
+	dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n",
+				is_pmmu ? "PMMU" : "HMMU", addr);
 	hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask);
 
 	WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0);
@@ -8882,6 +8883,9 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo
 	addr <<= 32;
 	addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE_VA));
 
+	if (!is_pmmu)
+		addr = gaudi2_mmu_descramble_addr(hdev, addr);
+
 	dev_err_ratelimited(hdev->dev, "%s access error on va 0x%llx\n",
 				is_pmmu ? "PMMU" : "HMMU", addr);
 	WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0);
@@ -8976,46 +8980,110 @@ static int gaudi2_handle_sm_err(struct hl_device *hdev, u16 event_type, u8 sm_in
 	return error_count;
 }
 
+static u64 get_hmmu_base(u16 event_type)
+{
+	u8 dcore, index_in_dcore;
+
+	switch (event_type) {
+	case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU0_SECURITY_ERROR:
+		dcore = 0;
+		index_in_dcore = 0;
+	break;
+	case GAUDI2_EVENT_HMMU_1_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU1_SPI_BASE ... GAUDI2_EVENT_HMMU1_SECURITY_ERROR:
+		dcore = 1;
+		index_in_dcore = 0;
+	break;
+	case GAUDI2_EVENT_HMMU_2_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU2_SPI_BASE ... GAUDI2_EVENT_HMMU2_SECURITY_ERROR:
+		dcore = 0;
+		index_in_dcore = 1;
+	break;
+	case GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU3_SPI_BASE ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR:
+		dcore = 1;
+		index_in_dcore = 1;
+	break;
+	case GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU4_SPI_BASE ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR:
+		dcore = 3;
+		index_in_dcore = 2;
+	break;
+	case GAUDI2_EVENT_HMMU_5_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU5_SPI_BASE ... GAUDI2_EVENT_HMMU5_SECURITY_ERROR:
+		dcore = 2;
+		index_in_dcore = 2;
+	break;
+	case GAUDI2_EVENT_HMMU_6_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU6_SPI_BASE ... GAUDI2_EVENT_HMMU6_SECURITY_ERROR:
+		dcore = 3;
+		index_in_dcore = 3;
+	break;
+	case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU7_SPI_BASE ... GAUDI2_EVENT_HMMU7_SECURITY_ERROR:
+		dcore = 2;
+		index_in_dcore = 3;
+	break;
+	case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU8_SPI_BASE ... GAUDI2_EVENT_HMMU8_SECURITY_ERROR:
+		dcore = 0;
+		index_in_dcore = 2;
+	break;
+	case GAUDI2_EVENT_HMMU_9_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU9_SPI_BASE ... GAUDI2_EVENT_HMMU9_SECURITY_ERROR:
+		dcore = 1;
+		index_in_dcore = 2;
+	break;
+	case GAUDI2_EVENT_HMMU_10_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU10_SPI_BASE ... GAUDI2_EVENT_HMMU10_SECURITY_ERROR:
+		dcore = 0;
+		index_in_dcore = 3;
+	break;
+	case GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU11_SPI_BASE ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR:
+		dcore = 1;
+		index_in_dcore = 3;
+	break;
+	case GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU12_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
+		dcore = 3;
+		index_in_dcore = 0;
+	break;
+	case GAUDI2_EVENT_HMMU_13_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU13_SPI_BASE ... GAUDI2_EVENT_HMMU13_SECURITY_ERROR:
+		dcore = 2;
+		index_in_dcore = 0;
+	break;
+	case GAUDI2_EVENT_HMMU_14_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU14_SPI_BASE ... GAUDI2_EVENT_HMMU14_SECURITY_ERROR:
+		dcore = 3;
+		index_in_dcore = 1;
+	break;
+	case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU15_SPI_BASE ... GAUDI2_EVENT_HMMU15_SECURITY_ERROR:
+		dcore = 2;
+		index_in_dcore = 1;
+	break;
+	default:
+		return ULONG_MAX;
+	}
+
+	return mmDCORE0_HMMU0_MMU_BASE + dcore * DCORE_OFFSET + index_in_dcore * DCORE_HMMU_OFFSET;
+}
+
 static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask)
 {
 	bool is_pmmu = false;
 	u32 error_count = 0;
 	u64 mmu_base;
-	u8 index;
 
 	switch (event_type) {
-	case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR:
-		index = (event_type - GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM) / 3;
-		mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
-		break;
-	case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP:
-		index = (event_type - GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP);
-		mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
-		break;
-	case GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR:
-		index = (event_type - GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM) / 3;
-		mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
-		break;
-	case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP:
-		index = (event_type - GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP);
-		mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
-		break;
-	case GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR:
-		index = (event_type - GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM) / 3;
-		mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
-		break;
-	case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP:
-		index = (event_type - GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP);
-		mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
-		break;
-	case GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
-		index = (event_type - GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM) / 3;
-		mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
-		break;
-	case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
-		index = (event_type - GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP);
-		mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
+	case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
+	case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
+		mmu_base = get_hmmu_base(event_type);
 		break;
+
 	case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR:
 	case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0:
 		is_pmmu = true;
@@ -9025,6 +9093,9 @@ static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type,
 		return 0;
 	}
 
+	if (mmu_base == ULONG_MAX)
+		return 0;
+
 	error_count = gaudi2_handle_mmu_spi_sei_generic(hdev, event_type, mmu_base,
 							is_pmmu, event_mask);
 	hl_check_for_glbl_errors(hdev);
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-03-27 11:22 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-27 11:22 [PATCH 1/3] accel/habanalabs: fix access error clear event Oded Gabbay
2023-03-27 11:22 ` [PATCH 2/3] accel/habanalabs: improvements to FW ver extraction Oded Gabbay
2023-03-27 11:22 ` [PATCH 3/3] accel/habanalabs: fix HBM MMU interrupt handling Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).