All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions
@ 2024-02-20 16:01 Oded Gabbay
  2024-02-20 16:01 ` [PATCH 02/13] accel/habanalabs: remove hop size from asic properties Oded Gabbay
                   ` (12 more replies)
  0 siblings, 13 replies; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

Currently, the code contains 2 types of FW version comparison functions:
- hl_is_fw_sw_ver_[below/equal_or_greater]()
- gaudi2 specific function of the type
  gaudi2_is_fw_ver_[below/above]x_y_z()

Moreover, some functions use the inner FW version, which should only be
used for staging during development and not for version dependencies.

Finally, some tests are done against deprecated FW versions, with which
LKD should hold no compatibility.

This commit aligns all APIs to a single function that just compares the
version and returns an integer indicator (similar in some way to
strcmp()).

In addition, this generic function now also considers the sub-minor FW
version, and this commit also removes dead code that resulted from
compatibility with deprecated FW versions.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++
 drivers/accel/habanalabs/common/habanalabs.h  | 20 +------
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 57 +++----------------
 3 files changed, 34 insertions(+), 68 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 3558a6a8e192..e7dcf2fe6552 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -40,6 +40,31 @@ static char *comms_sts_str_arr[COMMS_STS_INVLD_LAST] = {
 	[COMMS_STS_TIMEOUT_ERR] = __stringify(COMMS_STS_TIMEOUT_ERR),
 };
 
+/**
+ * hl_fw_version_cmp() - compares the FW version to a specific version
+ *
+ * @hdev: pointer to hl_device structure
+ * @major: major number of a reference version
+ * @minor: minor number of a reference version
+ * @subminor: sub-minor number of a reference version
+ *
+ * Return 1 if FW version greater than the reference version, -1 if it's
+ *         smaller and 0 if versions are identical.
+ */
+int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor)
+{
+	if (hdev->fw_sw_major_ver != major)
+		return (hdev->fw_sw_major_ver > major) ? 1 : -1;
+
+	if (hdev->fw_sw_minor_ver != minor)
+		return (hdev->fw_sw_minor_ver > minor) ? 1 : -1;
+
+	if (hdev->fw_sw_sub_minor_ver != subminor)
+		return (hdev->fw_sw_sub_minor_ver > subminor) ? 1 : -1;
+
+	return 0;
+}
+
 static char *extract_fw_ver_from_str(const char *fw_str)
 {
 	char *str, *fw_ver, *whitespace;
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 7397ce86b7f0..634a470efe27 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3600,25 +3600,6 @@ struct hl_ioctl_desc {
 	hl_ioctl_t *func;
 };
 
-static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major, u32 fw_sw_minor)
-{
-	if (hdev->fw_sw_major_ver < fw_sw_major)
-		return true;
-	if (hdev->fw_sw_major_ver > fw_sw_major)
-		return false;
-	if (hdev->fw_sw_minor_ver < fw_sw_minor)
-		return true;
-	return false;
-}
-
-static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major,
-							u32 fw_sw_minor)
-{
-	return (hdev->fw_sw_major_ver > fw_sw_major ||
-			(hdev->fw_sw_major_ver == fw_sw_major &&
-					hdev->fw_sw_minor_ver >= fw_sw_minor));
-}
-
 /*
  * Kernel module functions that can be accessed by entire module
  */
@@ -3923,6 +3904,7 @@ void hl_mmu_dr_flush(struct hl_ctx *ctx);
 int hl_mmu_dr_init(struct hl_device *hdev);
 void hl_mmu_dr_fini(struct hl_device *hdev);
 
+int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor);
 int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
 				void __iomem *dst, u32 src_offset, u32 size);
 int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value);
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 1f061209ae21..4a0917aa4dd7 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2601,6 +2601,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 
 	prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0;
 
+	prop->supports_advanced_cpucp_rc = true;
+
 	return 0;
 
 free_qprops:
@@ -3308,8 +3310,6 @@ static int gaudi2_late_init(struct hl_device *hdev)
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
 	int rc;
 
-	hdev->asic_prop.supports_advanced_cpucp_rc = true;
-
 	rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS,
 					gaudi2->virt_msix_db_dma_addr);
 	if (rc) {
@@ -3783,7 +3783,7 @@ static int gaudi2_sw_init(struct hl_device *hdev)
 	prop->supports_compute_reset = true;
 
 	/* Event queue sanity check added in FW version 1.11 */
-	if (hl_is_fw_sw_ver_below(hdev, 1, 11))
+	if (hl_fw_version_cmp(hdev, 1, 11, 0) < 0)
 		hdev->event_queue.check_eqe_index = false;
 	else
 		hdev->event_queue.check_eqe_index = true;
@@ -6314,26 +6314,6 @@ static void gaudi2_execute_hard_reset(struct hl_device *hdev)
 	WREG32(mmPSOC_RESET_CONF_SW_ALL_RST, 1);
 }
 
-static int gaudi2_get_soft_rst_done_indication(struct hl_device *hdev, u32 poll_timeout_us)
-{
-	int i, rc = 0;
-	u32 reg_val;
-
-	for (i = 0 ; i < GAUDI2_RESET_POLL_CNT ; i++)
-		rc = hl_poll_timeout(
-			hdev,
-			mmCPU_RST_STATUS_TO_HOST,
-			reg_val,
-			reg_val == CPU_RST_STATUS_SOFT_RST_DONE,
-			1000,
-			poll_timeout_us);
-
-	if (rc)
-		dev_err(hdev->dev, "Timeout while waiting for FW to complete soft reset (0x%x)\n",
-				reg_val);
-	return rc;
-}
-
 /**
  * gaudi2_execute_soft_reset - execute soft reset by driver/FW
  *
@@ -6346,23 +6326,8 @@ static int gaudi2_get_soft_rst_done_indication(struct hl_device *hdev, u32 poll_
 static int gaudi2_execute_soft_reset(struct hl_device *hdev, bool driver_performs_reset,
 						u32 poll_timeout_us)
 {
-	int rc;
-
-	if (!driver_performs_reset) {
-		if (hl_is_fw_sw_ver_below(hdev, 1, 10)) {
-			/* set SP to indicate reset request sent to FW */
-			WREG32(mmCPU_RST_STATUS_TO_HOST, CPU_RST_STATUS_NA);
-
-			WREG32(mmGIC_HOST_SOFT_RST_IRQ_POLL_REG,
-				gaudi2_irq_map_table[GAUDI2_EVENT_CPU_SOFT_RESET].cpu_id);
-
-			/* wait for f/w response */
-			rc = gaudi2_get_soft_rst_done_indication(hdev, poll_timeout_us);
-		} else {
-			rc = hl_fw_send_soft_reset(hdev);
-		}
-		return rc;
-	}
+	if (!driver_performs_reset)
+		return hl_fw_send_soft_reset(hdev);
 
 	/* Block access to engines, QMANs and SM during reset, these
 	 * RRs will be reconfigured after soft reset.
@@ -7914,7 +7879,7 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 	bool has_block_id = false;
 	u16 block_id;
 
-	if (!hl_is_fw_sw_ver_below(hdev, 1, 12))
+	if (hl_fw_version_cmp(hdev, 1, 12, 0) >= 0)
 		has_block_id = true;
 
 	ecc_address = le64_to_cpu(ecc_data->ecc_address);
@@ -8165,13 +8130,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 		}
 
 		hbw_rtr_id = gaudi2_tpc_initiator_hbw_rtr_id[module_idx];
-
-		if (hl_is_fw_sw_ver_below(hdev, 1, 9) &&
-				!hdev->asic_prop.fw_security_enabled &&
-				((module_idx == 0) || (module_idx == 1)))
-			lbw_rtr_id = DCORE0_RTR0;
-		else
-			lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx];
+		lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx];
 		break;
 	case RAZWI_MME:
 		sprintf(initiator_name, "MME_%u", module_idx);
@@ -10080,7 +10039,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data);
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
-		if (hl_is_fw_sw_ver_equal_or_greater(hdev, 1, 13))
+		if (hl_fw_version_cmp(hdev, 1, 13, 0) >= 0)
 			is_critical = true;
 		break;
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 02/13] accel/habanalabs: remove hop size from asic properties
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 22:42   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 03/13] accel/habanalabs: modify print for skip loading linux FW to debug log Oded Gabbay
                   ` (11 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Farah Kassabri

From: Farah Kassabri <fkassabri@habana.ai>

The hop-size-related properties are MMU properties and not ASIC
properties, as the PMMU and HMMU could have different sizes.

Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/habanalabs.h  |  4 ----
 drivers/accel/habanalabs/common/mmu/mmu.c     | 22 ++++++++---------
 .../accel/habanalabs/common/mmu/mmu_v2_hr.c   | 24 +++++++++----------
 drivers/accel/habanalabs/gaudi/gaudi.c        |  8 +++----
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 12 ++++------
 drivers/accel/habanalabs/goya/goya.c          | 12 ++++------
 6 files changed, 36 insertions(+), 46 deletions(-)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 634a470efe27..c85849aefba6 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -594,8 +594,6 @@ struct hl_hints_range {
  *                 we display to the user
  * @mmu_pgt_size: MMU page tables total size.
  * @mmu_pte_size: PTE size in MMU page tables.
- * @mmu_hop_table_size: MMU hop table size.
- * @mmu_hop0_tables_total_size: total size of MMU hop0 tables.
  * @dram_page_size: The DRAM physical page size.
  * @cfg_size: configuration space size on SRAM.
  * @sram_size: total size of SRAM.
@@ -747,8 +745,6 @@ struct asic_fixed_properties {
 	u32				clk_pll_index;
 	u32				mmu_pgt_size;
 	u32				mmu_pte_size;
-	u32				mmu_hop_table_size;
-	u32				mmu_hop0_tables_total_size;
 	u32				dram_page_size;
 	u32				cfg_size;
 	u32				sram_size;
diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c
index fa7919dba783..d3eaab908457 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu.c
@@ -1236,7 +1236,7 @@ void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info)
 	struct hl_device *hdev = ctx->hdev;
 
 	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr,
-			hdev->asic_prop.mmu_hop_table_size);
+			hdev->asic_prop.dmmu.hop_table_size);
 	hash_del(&pgt_info->node);
 	kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
 	kfree(pgt_info);
@@ -1245,18 +1245,18 @@ void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info)
 u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx)
 {
 	return ctx->hdev->asic_prop.mmu_pgt_addr +
-			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+			(ctx->asid * ctx->hdev->asic_prop.dmmu.hop_table_size);
 }
 
 u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx)
 {
 	return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 +
-			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+			(ctx->asid * ctx->hdev->asic_prop.dmmu.hop_table_size);
 }
 
 u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
 {
-	u64 page_mask = ctx->hdev->asic_prop.mmu_hop_table_size - 1;
+	u64 page_mask = ctx->hdev->asic_prop.dmmu.hop_table_size - 1;
 	u64 shadow_hop_addr = shadow_addr & (~page_mask);
 	u64 pte_offset = shadow_addr & page_mask;
 	u64 phys_hop_addr;
@@ -1326,13 +1326,13 @@ u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx)
 		return ULLONG_MAX;
 
 	phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool,
-					prop->mmu_hop_table_size);
+					prop->dmmu.hop_table_size);
 	if (!phys_addr) {
 		dev_err(hdev->dev, "failed to allocate page\n");
 		goto pool_add_err;
 	}
 
-	shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
+	shadow_addr = (u64) (uintptr_t) kzalloc(prop->dmmu.hop_table_size,
 						GFP_KERNEL);
 	if (!shadow_addr)
 		goto shadow_err;
@@ -1347,7 +1347,7 @@ u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx)
 
 shadow_err:
 	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool,
-			phys_addr, prop->mmu_hop_table_size);
+			phys_addr, prop->dmmu.hop_table_size);
 pool_add_err:
 	kfree(pgt_info);
 
@@ -1379,7 +1379,7 @@ int hl_mmu_dr_init(struct hl_device *hdev)
 	int rc;
 
 	hdev->mmu_priv.dr.mmu_pgt_pool =
-			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
+			gen_pool_create(__ffs(prop->dmmu.hop_table_size), -1);
 
 	if (!hdev->mmu_priv.dr.mmu_pgt_pool) {
 		dev_err(hdev->dev, "Failed to create page gen pool\n");
@@ -1387,8 +1387,8 @@ int hl_mmu_dr_init(struct hl_device *hdev)
 	}
 
 	rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr +
-			prop->mmu_hop0_tables_total_size,
-			prop->dmmu.pgt_size - prop->mmu_hop0_tables_total_size,
+			prop->dmmu.hop0_tables_total_size,
+			prop->dmmu.pgt_size - prop->dmmu.hop0_tables_total_size,
 			-1);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
@@ -1396,7 +1396,7 @@ int hl_mmu_dr_init(struct hl_device *hdev)
 	}
 
 	hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid,
-						prop->mmu_hop_table_size, GFP_KERNEL);
+						prop->dmmu.hop_table_size, GFP_KERNEL);
 	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
 		rc = -ENOMEM;
 		goto err_pool_add;
diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c b/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c
index afe7ef964f82..31507b2a431b 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c
@@ -47,7 +47,7 @@ static inline int hl_mmu_v2_hr_init(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 
-	return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size,
+	return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->pmmu.hop_table_size,
 				prop->mmu_pgt_size);
 }
 
@@ -65,7 +65,7 @@ static inline void hl_mmu_v2_hr_fini(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 
-	hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size);
+	hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->pmmu.hop_table_size);
 }
 
 /**
@@ -108,7 +108,7 @@ static void hl_mmu_v2_hr_ctx_fini(struct hl_ctx *ctx)
 			"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
 			pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
 		hl_mmu_hr_free_hop_remove_pgt(pgt_info, &ctx->hdev->mmu_priv.hr,
-							ctx->hdev->asic_prop.mmu_hop_table_size);
+							ctx->hdev->asic_prop.pmmu.hop_table_size);
 	}
 }
 
@@ -150,7 +150,7 @@ static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx,
 
 		curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i],
 							hop_pte_phys_addr[i],
-							ctx->hdev->asic_prop.mmu_hop_table_size);
+							ctx->hdev->asic_prop.pmmu.hop_table_size);
 
 		if ((i < hop_last) && (curr_pte & mmu_prop->last_mask)) {
 			hop_last = i;
@@ -169,14 +169,14 @@ static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx,
 
 	for (i = hop_last ; i > 0 ; i--) {
 		hl_mmu_hr_clear_pte(ctx, hops_pgt_info[i], hop_pte_phys_addr[i],
-						ctx->hdev->asic_prop.mmu_hop_table_size);
+						ctx->hdev->asic_prop.pmmu.hop_table_size);
 
 		if (hl_mmu_hr_put_pte(ctx, hops_pgt_info[i], &ctx->hdev->mmu_priv.hr,
-						ctx->hdev->asic_prop.mmu_hop_table_size))
+						ctx->hdev->asic_prop.pmmu.hop_table_size))
 			goto mapped;
 	}
 	hl_mmu_hr_clear_pte(ctx, hops_pgt_info[0], hop_pte_phys_addr[0],
-						ctx->hdev->asic_prop.mmu_hop_table_size);
+						ctx->hdev->asic_prop.pmmu.hop_table_size);
 
 mapped:
 	return 0;
@@ -255,7 +255,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
 									scrambled_virt_addr);
 		curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i],
 							hop_pte_phys_addr[i],
-							ctx->hdev->asic_prop.mmu_hop_table_size);
+							ctx->hdev->asic_prop.pmmu.hop_table_size);
 	}
 
 	if (curr_pte & PAGE_PRESENT_MASK) {
@@ -268,7 +268,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
 					*(u64 *) (uintptr_t)
 					hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i],
 							hop_pte_phys_addr[i],
-							ctx->hdev->asic_prop.mmu_hop_table_size),
+							ctx->hdev->asic_prop.pmmu.hop_table_size),
 					hop_pte_phys_addr[i]);
 		rc = -EINVAL;
 		goto err;
@@ -279,7 +279,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
 
 	/* Write the PTEs */
 	hl_mmu_hr_write_pte(ctx, hops_pgt_info[hop_last], hop_pte_phys_addr[hop_last], curr_pte,
-							ctx->hdev->asic_prop.mmu_hop_table_size);
+							ctx->hdev->asic_prop.pmmu.hop_table_size);
 
 	/* for each new hop, add its address to the table of previous-hop */
 	for (i = 1 ; i <= hop_last ; i++) {
@@ -287,7 +287,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
 			curr_pte = (hops_pgt_info[i]->phys_addr & HOP_PHYS_ADDR_MASK) |
 							PAGE_PRESENT_MASK;
 			hl_mmu_hr_write_pte(ctx, hops_pgt_info[i - 1], hop_pte_phys_addr[i - 1],
-						curr_pte, ctx->hdev->asic_prop.mmu_hop_table_size);
+						curr_pte, ctx->hdev->asic_prop.pmmu.hop_table_size);
 			if (i - 1)
 				hl_mmu_hr_get_pte(ctx, &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs,
 								hops_pgt_info[i - 1]->phys_addr);
@@ -303,7 +303,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
 	for (i = 1 ; i <= hop_last ; i++)
 		if (hop_new[i] && hops_pgt_info[i])
 			hl_mmu_hr_free_hop_remove_pgt(hops_pgt_info[i], &ctx->hdev->mmu_priv.hr,
-							ctx->hdev->asic_prop.mmu_hop_table_size);
+							ctx->hdev->asic_prop.pmmu.hop_table_size);
 
 	return rc;
 }
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index dde3839fe0e0..f2b04ffb0ecb 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -614,8 +614,6 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	else
 		prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
 	prop->mmu_pte_size = HL_PTE_SIZE;
-	prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
-	prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
 	prop->dram_page_size = PAGE_SIZE_2MB;
 	prop->device_mem_alloc_default_page_size = prop->dram_page_size;
 	prop->dram_supports_virtual_memory = false;
@@ -637,8 +635,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
 	prop->pmmu.last_mask = LAST_MASK;
 	/* TODO: will be duplicated until implementing per-MMU props */
-	prop->pmmu.hop_table_size = prop->mmu_hop_table_size;
-	prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+	prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+	prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
 
 	/* PMMU and HPMMU are the same except of page size */
 	memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@@ -3653,7 +3651,7 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 
 	for (i = 0 ; i < prop->max_asid ; i++) {
 		hop0_addr = prop->mmu_pgt_addr +
-				(i * prop->mmu_hop_table_size);
+				(i * prop->dmmu.hop_table_size);
 
 		rc = gaudi_mmu_update_asid_hop0_addr(hdev, i, hop0_addr);
 		if (rc) {
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 4a0917aa4dd7..26975179763a 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2467,8 +2467,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 
 	prop->dmmu.pgt_size = HMMU_PAGE_TABLES_SIZE;
 	prop->mmu_pte_size = HL_PTE_SIZE;
-	prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
-	prop->mmu_hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid;
 
 	prop->dmmu.hop_shifts[MMU_HOP0] = DHOP0_SHIFT;
 	prop->dmmu.hop_shifts[MMU_HOP1] = DHOP1_SHIFT;
@@ -2482,8 +2480,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 	prop->dmmu.num_hops = MMU_ARCH_4_HOPS;
 	prop->dmmu.last_mask = LAST_MASK;
 	prop->dmmu.host_resident = 0;
-	prop->dmmu.hop_table_size = prop->mmu_hop_table_size;
-	prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+	prop->dmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+	prop->dmmu.hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid;
 
 	/* As we need to set the pgt address in dram for HMMU init so we cannot
 	 * wait to the fw cpucp info to set the dram props as mmu init comes before
@@ -2500,8 +2498,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 	prop->pmmu.host_resident = 1;
 	prop->pmmu.num_hops = MMU_ARCH_6_HOPS;
 	prop->pmmu.last_mask = LAST_MASK;
-	prop->pmmu.hop_table_size = prop->mmu_hop_table_size;
-	prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+	prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+	prop->pmmu.hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid;
 
 	prop->hints_host_reserved_va_range.start_addr = RESERVED_VA_FOR_VIRTUAL_MSIX_DOORBELL_START;
 	prop->hints_host_reserved_va_range.end_addr = RESERVED_VA_RANGE_FOR_ARC_ON_HOST_END;
@@ -5934,7 +5932,7 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base,
 		if (host_resident_pgt)
 			hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr;
 		else
-			hop0_addr = prop->mmu_pgt_addr + (asid * prop->mmu_hop_table_size);
+			hop0_addr = prop->mmu_pgt_addr + (asid * prop->dmmu.hop_table_size);
 
 		rc = gaudi2_mmu_update_asid_hop0_addr(hdev, stlb_base, asid, hop0_addr);
 		if (rc) {
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index 1322cb330c57..5a359c3bdc78 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -413,8 +413,6 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 	else
 		prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
 	prop->mmu_pte_size = HL_PTE_SIZE;
-	prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
-	prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
 	prop->dram_page_size = PAGE_SIZE_2MB;
 	prop->device_mem_alloc_default_page_size = prop->dram_page_size;
 	prop->dram_supports_virtual_memory = true;
@@ -435,8 +433,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 	prop->dmmu.num_hops = MMU_ARCH_5_HOPS;
 	prop->dmmu.last_mask = LAST_MASK;
 	/* TODO: will be duplicated until implementing per-MMU props */
-	prop->dmmu.hop_table_size = prop->mmu_hop_table_size;
-	prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+	prop->dmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+	prop->dmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
 
 	/* shifts and masks are the same in PMMU and DMMU */
 	memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
@@ -446,8 +444,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 	prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
 	prop->pmmu.last_mask = LAST_MASK;
 	/* TODO: will be duplicated until implementing per-MMU props */
-	prop->pmmu.hop_table_size = prop->mmu_hop_table_size;
-	prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+	prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+	prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
 
 	/* PMMU and HPMMU are the same except of page size */
 	memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@@ -2678,7 +2676,7 @@ int goya_mmu_init(struct hl_device *hdev)
 
 	for (i = 0 ; i < prop->max_asid ; i++) {
 		hop0_addr = prop->mmu_pgt_addr +
-				(i * prop->mmu_hop_table_size);
+				(i * prop->dmmu.hop_table_size);
 
 		rc = goya_mmu_update_asid_hop0_addr(hdev, i, hop0_addr);
 		if (rc) {
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 03/13] accel/habanalabs: modify print for skip loading linux FW to debug log
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
  2024-02-20 16:01 ` [PATCH 02/13] accel/habanalabs: remove hop size from asic properties Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 22:45   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 04/13] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info Oded Gabbay
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Skipping the loading of a Linux FW image into the device is done, with
the currently supported ASICs, for test purposes only.
Moreover, for future supported ASICs it is possible that there won't be
a need to load such an image.
The print in such a case is therefore not needed in most cases, so
replace the used dev_info() with dev_dbg().

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index e7dcf2fe6552..364d292c76fa 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2820,7 +2820,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	hdev->asic_funcs->init_cpu_scrambler_dram(hdev);
 
 	if (!(hdev->fw_components & FW_TYPE_LINUX)) {
-		dev_info(hdev->dev, "Skip loading Linux F/W\n");
+		dev_dbg(hdev->dev, "Skip loading Linux F/W\n");
 		return 0;
 	}
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 04/13] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
  2024-02-20 16:01 ` [PATCH 02/13] accel/habanalabs: remove hop size from asic properties Oded Gabbay
  2024-02-20 16:01 ` [PATCH 03/13] accel/habanalabs: modify print for skip loading linux FW to debug log Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 22:49   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 05/13] accel/habanalabs: fix glbl error cause handling Oded Gabbay
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

The FW interrupt info for a PCIe addr_dec event is set correctly, so
check for either global errors or razwi according to the indications
there.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 26975179763a..671241735a6e 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -8942,9 +8942,6 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ
 	u32 error_count = 0;
 	int i;
 
-	gaudi2_print_event(hdev, event_type, true,
-		"intr_cause_data: %#llx", intr_cause_data);
-
 	for (i = 0 ; i < GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE ; i++) {
 		if (!(intr_cause_data & BIT_ULL(i)))
 			continue;
@@ -8953,15 +8950,16 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ
 			"err cause: %s", gaudi2_pcie_addr_dec_error_cause[i]);
 		error_count++;
 
-		/*
-		 * Always check for LBW and HBW additional info as the indication itself is
-		 * sometimes missing
-		 */
+		switch (intr_cause_data & BIT_ULL(i)) {
+		case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK:
+			hl_check_for_glbl_errors(hdev);
+			break;
+		case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK:
+			gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask);
+			break;
+		}
 	}
 
-	hl_check_for_glbl_errors(hdev);
-	gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask);
-
 	return error_count;
 }
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 05/13] accel/habanalabs: fix glbl error cause handling
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (2 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 04/13] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:01   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 06/13] accel/habanalabs: fix debugfs files permissions Oded Gabbay
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

The glbl error cause handling has a wrong assumption that all error
bits are consecutive.
Fix the handling to check all relevant error bits per ASIC.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/habanalabs.h |  4 +--
 drivers/accel/habanalabs/common/security.c   | 33 +++++++++++++++-----
 drivers/accel/habanalabs/common/security.h   |  3 +-
 drivers/accel/habanalabs/gaudi2/gaudi2.c     | 10 +++---
 drivers/accel/habanalabs/gaudi2/gaudi2P.h    |  3 +-
 5 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index c85849aefba6..40107a4eba93 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -647,7 +647,7 @@ struct hl_hints_range {
  * @num_engine_cores: number of engine cpu cores.
  * @max_num_of_engines: maximum number of all engines in the ASIC.
  * @num_of_special_blocks: special_blocks array size.
- * @glbl_err_cause_num: global err cause number.
+ * @glbl_err_max_cause_num: global err max cause number.
  * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is
  *                 not supported.
  * @reserved_fw_mem_size: size in MB of dram memory reserved for FW.
@@ -779,7 +779,7 @@ struct asic_fixed_properties {
 	u32				num_engine_cores;
 	u32				max_num_of_engines;
 	u32				num_of_special_blocks;
-	u32				glbl_err_cause_num;
+	u32				glbl_err_max_cause_num;
 	u32				hbw_flush_reg;
 	u32				reserved_fw_mem_size;
 	u16				collective_first_sob;
diff --git a/drivers/accel/habanalabs/common/security.c b/drivers/accel/habanalabs/common/security.c
index fe913965dbad..5402a3cd0491 100644
--- a/drivers/accel/habanalabs/common/security.c
+++ b/drivers/accel/habanalabs/common/security.c
@@ -7,15 +7,31 @@
 
 #include "habanalabs.h"
 
-static const char * const hl_glbl_error_cause[HL_MAX_NUM_OF_GLBL_ERR_CAUSE] = {
+static const char * const hl_glbl_error_cause[] = {
 	"Error due to un-priv read",
 	"Error due to un-secure read",
 	"Error due to read from unmapped reg",
 	"Error due to un-priv write",
 	"Error due to un-secure write",
 	"Error due to write to unmapped reg",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
 	"External I/F write sec violation",
 	"External I/F write to un-mapped reg",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
+	"N/A",
 	"Read to write only",
 	"Write to read only"
 };
@@ -671,10 +687,11 @@ static bool hl_check_block_range_exclusion(struct hl_device *hdev,
 static int hl_read_glbl_errors(struct hl_device *hdev,
 		u32 blk_idx, u32 major, u32 minor, u32 sub_minor, void *data)
 {
-	struct hl_special_block_info *special_blocks = hdev->asic_prop.special_blocks;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_special_block_info *special_blocks = prop->special_blocks;
 	struct hl_special_block_info *current_block = &special_blocks[blk_idx];
 	u32 glbl_err_addr, glbl_err_cause, addr_val, cause_val, block_base,
-		base = current_block->base_addr - lower_32_bits(hdev->asic_prop.cfg_base_address);
+		base = current_block->base_addr - lower_32_bits(prop->cfg_base_address);
 	int i;
 
 	block_base = base + major * current_block->major_offset +
@@ -689,13 +706,13 @@ static int hl_read_glbl_errors(struct hl_device *hdev,
 	glbl_err_addr = block_base + HL_GLBL_ERR_ADDR_OFFSET;
 	addr_val = RREG32(glbl_err_addr);
 
-	for (i = 0 ; i < hdev->asic_prop.glbl_err_cause_num ; i++) {
+	for (i = 0 ; i <= prop->glbl_err_max_cause_num ; i++) {
 		if (cause_val & BIT(i))
 			dev_err_ratelimited(hdev->dev,
-				"%s, addr %#llx\n",
-				hl_glbl_error_cause[i],
-				hdev->asic_prop.cfg_base_address + block_base +
-				FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val));
+					"%s, addr %#llx\n",
+					hl_glbl_error_cause[i],
+					prop->cfg_base_address + block_base +
+						FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val));
 	}
 
 	WREG32(glbl_err_cause, cause_val);
diff --git a/drivers/accel/habanalabs/common/security.h b/drivers/accel/habanalabs/common/security.h
index d7a3b3e82ea4..476f70687c09 100644
--- a/drivers/accel/habanalabs/common/security.h
+++ b/drivers/accel/habanalabs/common/security.h
@@ -13,8 +13,7 @@
 struct hl_device;
 
 /* special blocks */
-#define HL_MAX_NUM_OF_GLBL_ERR_CAUSE		10
-#define HL_GLBL_ERR_ADDRESS_MASK		GENMASK(11, 0)
+#define HL_GLBL_ERR_ADDRESS_MASK	GENMASK(11, 0)
 /* GLBL_ERR_ADDR register offset from the start of the block */
 #define HL_GLBL_ERR_ADDR_OFFSET		0xF44
 /* GLBL_ERR_CAUSE register offset from the start of the block */
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 671241735a6e..189d8da6a624 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -158,11 +158,13 @@
 #define RAZWI_INITIATOR_ID_X_Y(xl, yl, xh) \
 	(RAZWI_INITIATOR_ID_X_Y_LOW(xl, yl) | RAZWI_INITIATOR_ID_X_HIGH(xh))
 
-#define PSOC_RAZWI_ENG_STR_SIZE 128
-#define PSOC_RAZWI_MAX_ENG_PER_RTR 5
+#define PSOC_RAZWI_ENG_STR_SIZE			128
+#define PSOC_RAZWI_MAX_ENG_PER_RTR		5
 
 /* HW scrambles only bits 0-25 */
-#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26)
+#define HW_UNSCRAMBLED_BITS_MASK		GENMASK_ULL(63, 26)
+
+#define GAUDI2_GLBL_ERR_MAX_CAUSE_NUM		17
 
 struct gaudi2_razwi_info {
 	u32 axuser_xy;
@@ -3587,7 +3589,7 @@ static int gaudi2_special_blocks_config(struct hl_device *hdev)
 	int i, rc;
 
 	/* Configure Special blocks */
-	prop->glbl_err_cause_num = GAUDI2_NUM_OF_GLBL_ERR_CAUSE;
+	prop->glbl_err_max_cause_num = GAUDI2_GLBL_ERR_MAX_CAUSE_NUM;
 	prop->num_of_special_blocks = ARRAY_SIZE(gaudi2_special_blocks);
 	prop->special_blocks = kmalloc_array(prop->num_of_special_blocks,
 			sizeof(*prop->special_blocks), GFP_KERNEL);
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
index bc508c9cee5c..eee41387b269 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -237,9 +237,8 @@
 #define GAUDI2_SOB_INCREMENT_BY_ONE	(FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \
 					FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1))
 
-#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0)
+#define GAUDI2_NUM_TESTED_QS		(GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0)
 
-#define GAUDI2_NUM_OF_GLBL_ERR_CAUSE		8
 
 enum gaudi2_reserved_sob_id {
 	GAUDI2_RESERVED_SOB_CS_COMPLETION_FIRST,
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 06/13] accel/habanalabs: fix debugfs files permissions
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (3 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 05/13] accel/habanalabs: fix glbl error cause handling Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:10   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 07/13] accel/habanalabs: initialize maybe-uninitialized variables Oded Gabbay
                   ` (7 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Avri Kehat

From: Avri Kehat <akehat@habana.ai>

debugfs files are created with permissions that don't align
with the access requirements.

Signed-off-by: Avri Kehat <akehat@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/debugfs.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c
index 01f071d52570..ab0fe74b49d0 100644
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -1643,19 +1643,19 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
 				&hl_data64b_fops);
 
 	debugfs_create_file("set_power_state",
-				0200,
+				0644,
 				root,
 				dev_entry,
 				&hl_power_fops);
 
 	debugfs_create_file("device",
-				0200,
+				0644,
 				root,
 				dev_entry,
 				&hl_device_fops);
 
 	debugfs_create_file("clk_gate",
-				0200,
+				0644,
 				root,
 				dev_entry,
 				&hl_clk_gate_fops);
@@ -1667,13 +1667,13 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
 				&hl_stop_on_err_fops);
 
 	debugfs_create_file("dump_security_violations",
-				0644,
+				0400,
 				root,
 				dev_entry,
 				&hl_security_violations_fops);
 
 	debugfs_create_file("dump_razwi_events",
-				0644,
+				0400,
 				root,
 				dev_entry,
 				&hl_razwi_check_fops);
@@ -1706,7 +1706,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
 				&hdev->reset_info.skip_reset_on_timeout);
 
 	debugfs_create_file("state_dump",
-				0600,
+				0644,
 				root,
 				dev_entry,
 				&hl_state_dump_fops);
@@ -1724,7 +1724,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
 
 	for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
 		debugfs_create_file(hl_debugfs_list[i].name,
-					0444,
+					0644,
 					root,
 					entry,
 					&hl_debugfs_fops);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 07/13] accel/habanalabs: initialize maybe-uninitialized variables
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (4 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 06/13] accel/habanalabs: fix debugfs files permissions Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:12   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 08/13] accel/habanalabs: fix error print Oded Gabbay
                   ` (6 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tal Risin

From: Tal Risin <trisin@habana.ai>

Prevent static analysis warning.

Signed-off-by: Tal Risin <trisin@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/debugfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c
index ab0fe74b49d0..b1c88d1837d9 100644
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -484,7 +484,7 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
 	struct hl_debugfs_entry *entry = s->private;
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
-	char kbuf[MMU_KBUF_SIZE];
+	char kbuf[MMU_KBUF_SIZE] = {0};
 	char *c;
 	ssize_t rc;
 
@@ -546,7 +546,7 @@ static ssize_t mmu_ack_error_value_write(struct file *file,
 	struct hl_debugfs_entry *entry = s->private;
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
-	char kbuf[MMU_KBUF_SIZE];
+	char kbuf[MMU_KBUF_SIZE] = {0};
 	ssize_t rc;
 
 	if (count > sizeof(kbuf) - 1)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 08/13] accel/habanalabs: fix error print
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (5 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 07/13] accel/habanalabs: initialize maybe-uninitialized variables Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:15   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 09/13] accel/habanalabs/gaudi2: drain event lacks rd/wr indication Oded Gabbay
                   ` (5 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

The unmasking is done per event, and the event is not necessarily a RAZWI.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 364d292c76fa..a3df7cf162d8 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -526,7 +526,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
 						0, &result);
 
 	if (rc)
-		dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
+		dev_err(hdev->dev, "failed to unmask event %d", event_type);
 
 	return rc;
 }
@@ -565,7 +565,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
 						total_pkt_size, 0, &result);
 
 	if (rc)
-		dev_err(hdev->dev, "failed to unmask IRQ array\n");
+		dev_err(hdev->dev, "failed to unmask event array\n");
 
 	kfree(pkt);
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 09/13] accel/habanalabs/gaudi2: drain event lacks rd/wr indication
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (6 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 08/13] accel/habanalabs: fix error print Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:17   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 10/13] accel/habanalabs/hwmon: rate limit errors user can generate Oded Gabbay
                   ` (4 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Due to a H/W issue, AXI drain event does not include a read/write
indication, hence we remove this print.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 189d8da6a624..ba1518f2bf5c 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -9548,25 +9548,17 @@ static int gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev, u16 event_type)
 static int gaudi2_handle_pcie_drain(struct hl_device *hdev,
 			struct hl_eq_pcie_drain_ind_data *drain_data)
 {
-	u64 lbw_rd, lbw_wr, hbw_rd, hbw_wr, cause, error_count = 0;
+	u64 cause, error_count = 0;
 
 	cause = le64_to_cpu(drain_data->intr_cause.intr_cause_data);
-	lbw_rd = le64_to_cpu(drain_data->drain_rd_addr_lbw);
-	lbw_wr = le64_to_cpu(drain_data->drain_wr_addr_lbw);
-	hbw_rd = le64_to_cpu(drain_data->drain_rd_addr_hbw);
-	hbw_wr = le64_to_cpu(drain_data->drain_wr_addr_hbw);
 
 	if (cause & BIT_ULL(0)) {
-		dev_err_ratelimited(hdev->dev,
-			"PCIE AXI drain LBW completed, read_err %u, write_err %u\n",
-			!!lbw_rd, !!lbw_wr);
+		dev_err_ratelimited(hdev->dev, "PCIE AXI drain LBW completed\n");
 		error_count++;
 	}
 
 	if (cause & BIT_ULL(1)) {
-		dev_err_ratelimited(hdev->dev,
-			"PCIE AXI drain HBW completed, raddr %#llx, waddr %#llx\n",
-			hbw_rd, hbw_wr);
+		dev_err_ratelimited(hdev->dev, "PCIE AXI drain HBW completed\n");
 		error_count++;
 	}
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 10/13] accel/habanalabs/hwmon: rate limit errors user can generate
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (7 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 09/13] accel/habanalabs/gaudi2: drain event lacks rd/wr indication Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:21   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 11/13] accel/habanalabs: handle reserved memory request when working with full FW Oded Gabbay
                   ` (3 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Fetching sensor data can fail due to various reasons. In order
not to pollute the kernel log, those error prints must be
rate limited.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/hwmon.c | 29 +++++++++++++------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c
index 1ee2ee07e9ed..36b951b5f503 100644
--- a/drivers/accel/habanalabs/common/hwmon.c
+++ b/drivers/accel/habanalabs/common/hwmon.c
@@ -46,7 +46,7 @@ static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types
 		break;
 
 	default:
-		dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type);
+		dev_err_ratelimited(hdev->dev, "unsupported h/w sensor type %d\n", type);
 		flags = cpucp_flags;
 		break;
 	}
@@ -134,7 +134,7 @@ static u32 adjust_hwmon_flags(struct hl_device *hdev, enum hwmon_sensor_types ty
 			break;
 
 		default:
-			dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type);
+			dev_err_ratelimited(hdev->dev, "unsupported h/w sensor type %d\n", type);
 			flags = cpucp_flags;
 			break;
 		}
@@ -162,7 +162,8 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sen
 			break;
 
 		if (type >= HWMON_NR_SENSOR_TYPES) {
-			dev_err(hdev->dev, "Got wrong sensor type %d from device\n", type);
+			dev_err_ratelimited(hdev->dev,
+				"Got wrong sensor type %d from device\n", type);
 			return -EINVAL;
 		}
 
@@ -584,7 +585,7 @@ int hl_get_temperature(struct hl_device *hdev,
 	*value = (long) result;
 
 	if (rc) {
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to get temperature from sensor %d, error %d\n",
 			sensor_index, rc);
 		*value = 0;
@@ -611,7 +612,7 @@ int hl_set_temperature(struct hl_device *hdev,
 						0, NULL);
 
 	if (rc)
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to set temperature of sensor %d, error %d\n",
 			sensor_index, rc);
 
@@ -638,7 +639,7 @@ int hl_get_voltage(struct hl_device *hdev,
 	*value = (long) result;
 
 	if (rc) {
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to get voltage from sensor %d, error %d\n",
 			sensor_index, rc);
 		*value = 0;
@@ -667,7 +668,7 @@ int hl_get_current(struct hl_device *hdev,
 	*value = (long) result;
 
 	if (rc) {
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to get current from sensor %d, error %d\n",
 			sensor_index, rc);
 		*value = 0;
@@ -696,7 +697,7 @@ int hl_get_fan_speed(struct hl_device *hdev,
 	*value = (long) result;
 
 	if (rc) {
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to get fan speed from sensor %d, error %d\n",
 			sensor_index, rc);
 		*value = 0;
@@ -725,7 +726,7 @@ int hl_get_pwm_info(struct hl_device *hdev,
 	*value = (long) result;
 
 	if (rc) {
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to get pwm info from sensor %d, error %d\n",
 			sensor_index, rc);
 		*value = 0;
@@ -752,7 +753,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 						0, NULL);
 
 	if (rc)
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to set pwm info to sensor %d, error %d\n",
 			sensor_index, rc);
 }
@@ -775,7 +776,7 @@ int hl_set_voltage(struct hl_device *hdev,
 						0, NULL);
 
 	if (rc)
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to set voltage of sensor %d, error %d\n",
 			sensor_index, rc);
 
@@ -800,7 +801,7 @@ int hl_set_current(struct hl_device *hdev,
 						0, NULL);
 
 	if (rc)
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to set current of sensor %d, error %d\n",
 			sensor_index, rc);
 
@@ -831,7 +832,7 @@ int hl_set_power(struct hl_device *hdev,
 						0, NULL);
 
 	if (rc)
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to set power of sensor %d, error %d\n",
 			sensor_index, rc);
 
@@ -858,7 +859,7 @@ int hl_get_power(struct hl_device *hdev,
 	*value = (long) result;
 
 	if (rc) {
-		dev_err(hdev->dev,
+		dev_err_ratelimited(hdev->dev,
 			"Failed to get power of sensor %d, error %d\n",
 			sensor_index, rc);
 		*value = 0;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 11/13] accel/habanalabs: handle reserved memory request when working with full FW
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (8 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 10/13] accel/habanalabs/hwmon: rate limit errors user can generate Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:24   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 12/13] accel/habanalabs: keep explicit size of reserved memory for FW Oded Gabbay
                   ` (2 subsequent siblings)
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Currently the reserved memory request from FW is handled when running
with preboot only, but this request is relevant also when running with
full FW.
Modify to always handle this reservation request.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index a3df7cf162d8..4246162b6807 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2743,18 +2743,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
 	}
 
+	rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms));
+	if (rc)
+		goto protocol_err;
+
+	if (hdev->asic_prop.support_dynamic_resereved_fw_size)
+		hdev->asic_prop.reserved_fw_mem_size =
+				le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
+
 	if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
 		struct lkd_fw_binning_info *binning_info;
 
-		rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader,
-							sizeof(struct lkd_msg_comms));
-		if (rc)
-			goto protocol_err;
-
 		/* read preboot version */
 		rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT,
 				fw_loader->dynamic_loader.comm_desc.cur_fw_ver);
-
 		if (rc)
 			return rc;
 
@@ -2781,11 +2783,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 				hdev->decoder_binning, hdev->rotator_binning);
 		}
 
-		if (hdev->asic_prop.support_dynamic_resereved_fw_size) {
-			hdev->asic_prop.reserved_fw_mem_size =
-				le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
-		}
-
 		return 0;
 	}
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 12/13] accel/habanalabs: keep explicit size of reserved memory for FW
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (9 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 11/13] accel/habanalabs: handle reserved memory request when working with full FW Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:26   ` Carl Vanderlip
  2024-02-20 16:01 ` [PATCH 13/13] accel/habanalabs: modify pci health check Oded Gabbay
  2024-02-23 22:38 ` [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Carl Vanderlip
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

The reserved memory for FW is currently saved in an ASIC property in
units of MB, just like the value that comes from FW.
Apart from the fact that the unit is not clear from the property's name,
it also means that a conversion to the actual size is required everywhere
the property is used.
Modify the property to hold the size in bytes.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 2 +-
 drivers/accel/habanalabs/common/habanalabs.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 4246162b6807..348418643709 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2749,7 +2749,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 
 	if (hdev->asic_prop.support_dynamic_resereved_fw_size)
 		hdev->asic_prop.reserved_fw_mem_size =
-				le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
+			le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M;
 
 	if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
 		struct lkd_fw_binning_info *binning_info;
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 40107a4eba93..55495861f432 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -650,7 +650,7 @@ struct hl_hints_range {
  * @glbl_err_max_cause_num: global err max cause number.
  * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is
  *                 not supported.
- * @reserved_fw_mem_size: size in MB of dram memory reserved for FW.
+ * @reserved_fw_mem_size: size of dram memory reserved for FW.
  * @collective_first_sob: first sync object available for collective use
  * @collective_first_mon: first monitor available for collective use
  * @sync_stream_first_sob: first sync object available for sync stream use
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 13/13] accel/habanalabs: modify pci health check
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (10 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 12/13] accel/habanalabs: keep explicit size of reserved memory for FW Oded Gabbay
@ 2024-02-20 16:01 ` Oded Gabbay
  2024-02-23 23:32   ` Carl Vanderlip
  2024-02-23 22:38 ` [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Carl Vanderlip
  12 siblings, 1 reply; 28+ messages in thread
From: Oded Gabbay @ 2024-02-20 16:01 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Today we read PCI VENDOR-ID in order to make sure PCI link is
healthy. Apparently the VENDOR-ID might be stored on host and
hence, when we read it we might not access the PCI bus.
In order to make sure PCI health check is reliable, we will start
checking the DEVICE-ID instead.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 3b9e8a21d7df..8f92445c5a90 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1035,14 +1035,14 @@ static void device_early_fini(struct hl_device *hdev)
 
 static bool is_pci_link_healthy(struct hl_device *hdev)
 {
-	u16 vendor_id;
+	u16 device_id;
 
 	if (!hdev->pdev)
 		return false;
 
-	pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
+	pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id);
 
-	return (vendor_id == PCI_VENDOR_ID_HABANALABS);
+	return (device_id == hdev->pdev->device);
 }
 
 static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions
  2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
                   ` (11 preceding siblings ...)
  2024-02-20 16:01 ` [PATCH 13/13] accel/habanalabs: modify pci health check Oded Gabbay
@ 2024-02-23 22:38 ` Carl Vanderlip
  2024-02-25 10:36   ` Ohad Sharabi
  12 siblings, 1 reply; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 22:38 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Ohad Sharabi

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
 > From: Ohad Sharabi <osharabi@habana.ai>
 >
 > Currently, the code contains 2 types of FW version comparison functions:
 > - hl_is_fw_sw_ver_[below/equal_or_greater]()
 > - gaudi2 specific function of the type
 >    gaudi2_is_fw_ver_[below/above]x_y_z()
 >
 > Moreover, some functions use the inner FW version which should be only
 > stage during development but not version dependencies.
 >
 > Finally, some tests are done to deprecated FW version to which LKD
 > should hold no compatibility.
 >
 > This commit aligns all APIs to a single function that just compares the
 > version and return an integers indicator (similar in some way to
 > strcmp()).
 >
 > In addition, this generic function now considers also the sub-minor FW
 > version and also remove dead code resulting in deprecated FW versions
 > compatibility.
 >
 > Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
 > Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
 > Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
 > ---
 >   drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++
 >   drivers/accel/habanalabs/common/habanalabs.h  | 20 +------
 >   drivers/accel/habanalabs/gaudi2/gaudi2.c      | 57 +++----------------
 >   3 files changed, 34 insertions(+), 68 deletions(-)
 >
...
 > diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
 > index 1f061209ae21..4a0917aa4dd7 100644
 > --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
 > +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
 > @@ -2601,6 +2601,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 >
 >   	prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0;
 >
 > +	prop->supports_advanced_cpucp_rc = true;
 > +
 >   	return 0;
 >
 >   free_qprops:
 > @@ -3308,8 +3310,6 @@ static int gaudi2_late_init(struct hl_device *hdev)
 >   	struct gaudi2_device *gaudi2 = hdev->asic_specific;
 >   	int rc;
 >
 > -	hdev->asic_prop.supports_advanced_cpucp_rc = true;
 > -
 >   	rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS,
 >   					gaudi2->virt_msix_db_dma_addr);
 >   	if (rc) {

Is this change in support of the others in this patch? Feels like this 
should be more than one patch (adding new version_cmp, removing old checks).

-Carl V.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/13] accel/habanalabs: remove hop size from asic properties
  2024-02-20 16:01 ` [PATCH 02/13] accel/habanalabs: remove hop size from asic properties Oded Gabbay
@ 2024-02-23 22:42   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 22:42 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Farah Kassabri


On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Farah Kassabri <fkassabri@habana.ai>
> 
> The hop size related properties is a MMU properties and not
> asic properties.
> As for PMMU and HMMU we could have different sizes.
> 
> Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 03/13] accel/habanalabs: modify print for skip loading linux FW to debug log
  2024-02-20 16:01 ` [PATCH 03/13] accel/habanalabs: modify print for skip loading linux FW to debug log Oded Gabbay
@ 2024-02-23 22:45   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 22:45 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Tomer Tayar


On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
> 
> Skip loading a linux FW image into the device with the current supported
> ASICs is done for test purposes only.
> Moreover, for future supported ASICs it is possible that there won't be
> a need to load such an image.
> The print in such a case is therefore not needed in most cases, so
> replace the used dev_info() with dev_dbg().
> 
> Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 04/13] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info
  2024-02-20 16:01 ` [PATCH 04/13] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info Oded Gabbay
@ 2024-02-23 22:49   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 22:49 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Tomer Tayar

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
> 
> The FW interrupt info for a PCIe addr_dec event is set correctly, so
> check for either global errors or razwi according to the indications
> there.
> 
> Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 05/13] accel/habanalabs: fix glbl error cause handling
  2024-02-20 16:01 ` [PATCH 05/13] accel/habanalabs: fix glbl error cause handling Oded Gabbay
@ 2024-02-23 23:01   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:01 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Tomer Tayar

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
> 
> The glbl error cause handling has a wrong assumption that all error
> bits are consecutive.
> Fix the handling to check all relevant error bits per ASIC.
> 
> Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 06/13] accel/habanalabs: fix debugfs files permissions
  2024-02-20 16:01 ` [PATCH 06/13] accel/habanalabs: fix debugfs files permissions Oded Gabbay
@ 2024-02-23 23:10   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:10 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Avri Kehat

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Avri Kehat <akehat@habana.ai>
> 
> debugfs files are created with permissions that don't align
> with the access requirements.
> 
> Signed-off-by: Avri Kehat <akehat@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 07/13] accel/habanalabs: initialize maybe-uninitialized variables
  2024-02-20 16:01 ` [PATCH 07/13] accel/habanalabs: initialize maybe-uninitialized variables Oded Gabbay
@ 2024-02-23 23:12   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:12 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Tal Risin

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tal Risin <trisin@habana.ai>
> 
> Prevent static analysis warning.
> 
> Signed-off-by: Tal Risin <trisin@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 08/13] accel/habanalabs: fix error print
  2024-02-20 16:01 ` [PATCH 08/13] accel/habanalabs: fix error print Oded Gabbay
@ 2024-02-23 23:15   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:15 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Dani Liberman

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Dani Liberman <dliberman@habana.ai>
> 
> The unmasking is for an event, and it can be an event other than RAZWI.
> 
> Signed-off-by: Dani Liberman <dliberman@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 09/13] accel/habanalabs/gaudi2: drain event lacks rd/wr indication
  2024-02-20 16:01 ` [PATCH 09/13] accel/habanalabs/gaudi2: drain event lacks rd/wr indication Oded Gabbay
@ 2024-02-23 23:17   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:17 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Ofir Bitton

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Ofir Bitton <obitton@habana.ai>
> 
> Due to a H/W issue, AXI drain event does not include a read/write
> indication, hence we remove this print.
> 
> Signed-off-by: Ofir Bitton <obitton@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 10/13] accel/habanalabs/hwmon: rate limit errors user can generate
  2024-02-20 16:01 ` [PATCH 10/13] accel/habanalabs/hwmon: rate limit errors user can generate Oded Gabbay
@ 2024-02-23 23:21   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:21 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Ofir Bitton

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Ofir Bitton <obitton@habana.ai>
> 
> Fetching sensor data can fail due to various reasons. In order
> not to pollute the kernel log, those error prints must be
> rate limited.
> 
> Signed-off-by: Ofir Bitton <obitton@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 11/13] accel/habanalabs: handle reserved memory request when working with full FW
  2024-02-20 16:01 ` [PATCH 11/13] accel/habanalabs: handle reserved memory request when working with full FW Oded Gabbay
@ 2024-02-23 23:24   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:24 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Tomer Tayar

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
> 
> Currently the reserved memory request from FW is handled when running
> with preboot only, but this request is relevant also when running with
> full FW.
> Modify to always handle this reservation request.
> 
> Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 12/13] accel/habanalabs: keep explicit size of reserved memory for FW
  2024-02-20 16:01 ` [PATCH 12/13] accel/habanalabs: keep explicit size of reserved memory for FW Oded Gabbay
@ 2024-02-23 23:26   ` Carl Vanderlip
  0 siblings, 0 replies; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:26 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Tomer Tayar

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
> 
> The reserved memory for FW is currently saved in an ASIC property in
> units of MB, just like the value that comes from FW.
> Besides the fact that this is not clear from the property's name, it
> also means that a conversion to the actual size is required everywhere
> the property is used.
> Modify the property to hold the size in bytes.
> 
> Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/13] accel/habanalabs: modify pci health check
  2024-02-20 16:01 ` [PATCH 13/13] accel/habanalabs: modify pci health check Oded Gabbay
@ 2024-02-23 23:32   ` Carl Vanderlip
  2024-02-24 19:20     ` Ofir Bitton
  0 siblings, 1 reply; 28+ messages in thread
From: Carl Vanderlip @ 2024-02-23 23:32 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel, linux-kernel; +Cc: Ofir Bitton

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Ofir Bitton <obitton@habana.ai>
> 
> Today we read PCI VENDOR-ID in order to make sure PCI link is
> healthy. Apparently the VENDOR-ID might be stored on host and
> hence, when we read it we might not access the PCI bus.
> In order to make sure PCI health check is reliable, we will start
> checking the DEVICE-ID instead.

What's keeping some system from caching that as well?

Since this is checking for PCIe link health, it will be 0xFF when bad. 
Checking some part of Config Space that is writable would be more reliable.

-Carl V.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 13/13] accel/habanalabs: modify pci health check
  2024-02-23 23:32   ` Carl Vanderlip
@ 2024-02-24 19:20     ` Ofir Bitton
  0 siblings, 0 replies; 28+ messages in thread
From: Ofir Bitton @ 2024-02-24 19:20 UTC (permalink / raw)
  To: Carl Vanderlip, Oded Gabbay, dri-devel, linux-kernel

On 24/02/2024 1:32, Carl Vanderlip wrote:
> 
> On 2/20/2024 8:01 AM, Oded Gabbay wrote:
>> From: Ofir Bitton <obitton@habana.ai>
>>
>> Today we read PCI VENDOR-ID in order to make sure PCI link is
>> healthy. Apparently the VENDOR-ID might be stored on host and
>> hence, when we read it we might not access the PCI bus.
>> In order to make sure PCI health check is reliable, we will start
>> checking the DEVICE-ID instead.
> 
> What's keeping some system from caching that as well?

The PCI Controllers/switches we use in Gaudi family products might cache 
only the VENDOR-ID and not the DEVICE-ID.

> 
> Since this is checking for PCIe link health, it will be 0xFF when bad.
> Checking some part of Config Space that is writable would be more reliable.

Generally speaking I agree, but there is no product in the Gaudi family 
with a 0xFF DEVICE-ID (nor will there be), so I think this approach is 
good enough for our use-case.

--
Ofir

> 
> -Carl V.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions
  2024-02-23 22:38 ` [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Carl Vanderlip
@ 2024-02-25 10:36   ` Ohad Sharabi
  0 siblings, 0 replies; 28+ messages in thread
From: Ohad Sharabi @ 2024-02-25 10:36 UTC (permalink / raw)
  To: Carl Vanderlip, Oded Gabbay, dri-devel, linux-kernel

On 24/02/2024 0:38, Carl Vanderlip wrote:
> [You don't often get email from quic_carlv@quicinc.com. Learn why this 
> is important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> > From: Ohad Sharabi <osharabi@habana.ai>
> >
> > Currently, the code contains 2 types of FW version comparison functions:
> > - hl_is_fw_sw_ver_[below/equal_or_greater]()
> > - gaudi2 specific function of the type
> >    gaudi2_is_fw_ver_[below/above]x_y_z()
> >
> > Moreover, some functions use the inner FW version which should be only
> > stage during development but not version dependencies.
> >
> > Finally, some tests are done to deprecated FW version to which LKD
> > should hold no compatibility.
> >
> > This commit aligns all APIs to a single function that just compares the
> > versions and returns an integer indicator (similar in some way to
> > strcmp()).
> >
> > In addition, this generic function now also considers the sub-minor FW
> > version, and dead code that existed only for compatibility with
> > deprecated FW versions is removed.
> >
> > Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
> > Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> > Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
> > ---
> >   drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++
> >   drivers/accel/habanalabs/common/habanalabs.h  | 20 +------
> >   drivers/accel/habanalabs/gaudi2/gaudi2.c      | 57 +++----------------
> >   3 files changed, 34 insertions(+), 68 deletions(-)
> >
> ...
> > diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c
> b/drivers/accel/habanalabs/gaudi2/gaudi2.c
> > index 1f061209ae21..4a0917aa4dd7 100644
> > --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
> > +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
> > @@ -2601,6 +2601,8 @@ static int gaudi2_set_fixed_properties(struct
> hl_device *hdev)
> >
> >      prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0;
> >
> > +    prop->supports_advanced_cpucp_rc = true;
> > +
> >      return 0;
> >
> >   free_qprops:
> > @@ -3308,8 +3310,6 @@ static int gaudi2_late_init(struct hl_device 
> *hdev)
> >      struct gaudi2_device *gaudi2 = hdev->asic_specific;
> >      int rc;
> >
> > -    hdev->asic_prop.supports_advanced_cpucp_rc = true;
> > -
> >      rc = hl_fw_send_pci_access_msg(hdev, 
> CPUCP_PACKET_ENABLE_PCI_ACCESS,
> > gaudi2->virt_msix_db_dma_addr);
> >      if (rc) {

Carl,

Sure, we'll split the patches

Ohad

>
> Is this change in support of the others in this patch? Feels like this
> should be more than one patch (adding new version_cmp, removing old 
> checks).
>
> -Carl V.



^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2024-02-25 11:44 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-20 16:01 [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Oded Gabbay
2024-02-20 16:01 ` [PATCH 02/13] accel/habanalabs: remove hop size from asic properties Oded Gabbay
2024-02-23 22:42   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 03/13] accel/habanalabs: modify print for skip loading linux FW to debug log Oded Gabbay
2024-02-23 22:45   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 04/13] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info Oded Gabbay
2024-02-23 22:49   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 05/13] accel/habanalabs: fix glbl error cause handling Oded Gabbay
2024-02-23 23:01   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 06/13] accel/habanalabs: fix debugfs files permissions Oded Gabbay
2024-02-23 23:10   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 07/13] accel/habanalabs: initialize maybe-uninitialized variables Oded Gabbay
2024-02-23 23:12   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 08/13] accel/habanalabs: fix error print Oded Gabbay
2024-02-23 23:15   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 09/13] accel/habanalabs/gaudi2: drain event lacks rd/wr indication Oded Gabbay
2024-02-23 23:17   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 10/13] accel/habanalabs/hwmon: rate limit errors user can generate Oded Gabbay
2024-02-23 23:21   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 11/13] accel/habanalabs: handle reserved memory request when working with full FW Oded Gabbay
2024-02-23 23:24   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 12/13] accel/habanalabs: keep explicit size of reserved memory for FW Oded Gabbay
2024-02-23 23:26   ` Carl Vanderlip
2024-02-20 16:01 ` [PATCH 13/13] accel/habanalabs: modify pci health check Oded Gabbay
2024-02-23 23:32   ` Carl Vanderlip
2024-02-24 19:20     ` Ofir Bitton
2024-02-23 22:38 ` [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions Carl Vanderlip
2024-02-25 10:36   ` Ohad Sharabi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.