linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event
@ 2022-07-04  9:29 Oded Gabbay
  2022-07-04  9:29 ` [PATCH 02/12] habanalabs: wait for preboot ready after hard reset Oded Gabbay
                   ` (10 more replies)
  0 siblings, 11 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Correctable ECC events are not fatal, but as they accumulate, the f/w
can decide that a hard-reset is required. This indication is
propagated to the host using the existing ECC event interface.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c       | 25 +++++++++++--------
 .../misc/habanalabs/include/common/cpucp_if.h |  2 +-
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index edcf23b314a7..dbbd08600a56 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -6637,7 +6637,7 @@ static void gaudi2_print_irq_info(struct hl_device *hdev, u16 event_type)
 								event_type, desc);
 }
 
-static void gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
+static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 		struct hl_eq_ecc_data *ecc_data)
 {
 	u64 ecc_address = 0, ecc_syndrom = 0;
@@ -6647,8 +6647,11 @@ static void gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 	ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom);
 	memory_wrapper_idx = ecc_data->memory_wrapper_idx;
 
-	dev_err(hdev->dev, "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u\n",
-		ecc_address, ecc_syndrom, memory_wrapper_idx);
+	dev_err(hdev->dev,
+		"ECC error detected. address: %#llx. Syndrom: %#llx. block id %u. critical %u.\n",
+		ecc_address, ecc_syndrom, memory_wrapper_idx, ecc_data->is_critical);
+
+	return !!ecc_data->is_critical;
 }
 
 /*
@@ -7991,9 +7994,9 @@ static bool gaudi2_handle_hbm_mc_sei_err(struct hl_device *hdev, u16 event_type,
 	}
 
 	dev_err_ratelimited(hdev->dev,
-			"System Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Error cause: %s\n",
-			hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel,
-			hbm_mc_sei_cause[cause_idx]);
+		"System Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Critical(%u). Error cause: %s\n",
+		hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel,
+		sei_data->hdr.is_critical, hbm_mc_sei_cause[cause_idx]);
 
 	/* Print error-specific info */
 	switch (cause_idx) {
@@ -8032,6 +8035,8 @@ static bool gaudi2_handle_hbm_mc_sei_err(struct hl_device *hdev, u16 event_type,
 		break;
 	};
 
+	require_hard_reset |= !!sei_data->hdr.is_critical;
+
 	return require_hard_reset;
 }
 
@@ -8199,7 +8204,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 {
 	u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY;
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
-	bool hbm_require_reset = false, skip_reset = false;
+	bool reset_required = false, skip_reset = false;
 	int index, sbte_index;
 	u16 event_type;
 
@@ -8222,7 +8227,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		fallthrough;
 	case GAUDI2_EVENT_ROTATOR0_SERR ... GAUDI2_EVENT_ROTATOR1_DERR:
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
-		gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+		reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
 		break;
 
 	case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_PDMA1_QM:
@@ -8387,7 +8392,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_HBM0_MC0_SEI_SEVERE ... GAUDI2_EVENT_HBM5_MC1_SEI_NON_SEVERE:
 		if (gaudi2_handle_hbm_mc_sei_err(hdev, event_type, &eq_entry->sei_data)) {
 			reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
-			hbm_require_reset = true;
+			reset_required = true;
 		}
 		break;
 
@@ -8539,7 +8544,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 						event_type);
 	}
 
-	if ((gaudi2_irq_map_table[event_type].reset || hbm_require_reset) && !skip_reset)
+	if ((gaudi2_irq_map_table[event_type].reset || reset_required) && !skip_reset)
 		goto reset_device;
 
 	/* Send unmask irq only for interrupts not classified as MSG */
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 719b2ff80985..abf40e1c4965 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -192,7 +192,7 @@ struct hl_hbm_sei_header {
 	__u8 sei_cause;		/* enum hl_hbm_sei_cause */
 	__u8 mc_channel;		/* range: 0-3 */
 	__u8 mc_pseudo_channel;	/* range: 0-7 */
-	__u8 pad[1];
+	__u8 is_critical;
 };
 
 #define HBM_RD_ADDR_SID_SHIFT		0
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 02/12] habanalabs: wait for preboot ready after hard reset
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 03/12] habanalabs: naming refactor of user interrupt flow Oded Gabbay
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

Currently we do not wait for preboot to be ready after a hard reset.
This leads to a race in which the COMMs protocol begins but gets no
response from the f/w.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 65 +++++++++++++-------
 drivers/misc/habanalabs/common/habanalabs.h  | 27 ++++++--
 drivers/misc/habanalabs/gaudi/gaudi.c        | 19 ++++--
 drivers/misc/habanalabs/gaudi2/gaudi2.c      | 19 ++++--
 drivers/misc/habanalabs/goya/goya.c          | 19 ++++--
 5 files changed, 107 insertions(+), 42 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 9f0a24ee5af4..64c5cdfc6dcf 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -1245,15 +1245,10 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
 	}
 }
 
-static int hl_fw_read_preboot_caps(struct hl_device *hdev,
-					u32 cpu_boot_status_reg,
-					u32 sts_boot_dev_sts0_reg,
-					u32 sts_boot_dev_sts1_reg,
-					u32 boot_err0_reg, u32 boot_err1_reg,
-					u32 timeout)
+static int hl_fw_wait_preboot_ready(struct hl_device *hdev)
 {
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	u32 status, reg_val;
+	struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load;
+	u32 status;
 	int rc;
 
 	/* Need to check two possible scenarios:
@@ -1266,13 +1261,13 @@ static int hl_fw_read_preboot_caps(struct hl_device *hdev,
 	 */
 	rc = hl_poll_timeout(
 		hdev,
-		cpu_boot_status_reg,
+		pre_fw_load->cpu_boot_status_reg,
 		status,
 		(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
 		(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
 		(status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT),
 		hdev->fw_poll_interval_usec,
-		timeout);
+		pre_fw_load->wait_for_preboot_timeout);
 
 	if (rc) {
 		dev_err(hdev->dev, "CPU boot ready status timeout\n");
@@ -1282,12 +1277,32 @@ static int hl_fw_read_preboot_caps(struct hl_device *hdev,
 		 * of reading specific errors
 		 */
 		if (status != -1)
-			fw_read_errors(hdev, boot_err0_reg, boot_err1_reg,
-							sts_boot_dev_sts0_reg,
-							sts_boot_dev_sts1_reg);
+			fw_read_errors(hdev, pre_fw_load->boot_err0_reg,
+						pre_fw_load->boot_err1_reg,
+						pre_fw_load->sts_boot_dev_sts0_reg,
+						pre_fw_load->sts_boot_dev_sts1_reg);
 		return -EIO;
 	}
 
+	hdev->fw_loader.fw_comp_loaded |= FW_TYPE_PREBOOT_CPU;
+
+	return 0;
+}
+
+static int hl_fw_read_preboot_caps(struct hl_device *hdev)
+{
+	struct pre_fw_load_props *pre_fw_load;
+	struct asic_fixed_properties *prop;
+	u32 reg_val;
+	int rc;
+
+	prop = &hdev->asic_prop;
+	pre_fw_load = &hdev->fw_loader.pre_fw_load;
+
+	rc = hl_fw_wait_preboot_ready(hdev);
+	if (rc)
+		return rc;
+
 	/*
 	 * the registers DEV_STS* contain FW capabilities/features.
 	 * We can rely on this registers only if bit CPU_BOOT_DEV_STS*_ENABLED
@@ -1298,13 +1313,13 @@ static int hl_fw_read_preboot_caps(struct hl_device *hdev,
 	 * In case it is not enabled the stored value will be left 0- all
 	 * caps/features are off
 	 */
-	reg_val = RREG32(sts_boot_dev_sts0_reg);
+	reg_val = RREG32(pre_fw_load->sts_boot_dev_sts0_reg);
 	if (reg_val & CPU_BOOT_DEV_STS0_ENABLED) {
 		prop->fw_cpu_boot_dev_sts0_valid = true;
 		prop->fw_preboot_cpu_boot_dev_sts0 = reg_val;
 	}
 
-	reg_val = RREG32(sts_boot_dev_sts1_reg);
+	reg_val = RREG32(pre_fw_load->sts_boot_dev_sts1_reg);
 	if (reg_val & CPU_BOOT_DEV_STS1_ENABLED) {
 		prop->fw_cpu_boot_dev_sts1_valid = true;
 		prop->fw_preboot_cpu_boot_dev_sts1 = reg_val;
@@ -1447,24 +1462,21 @@ static int hl_fw_static_read_preboot_status(struct hl_device *hdev)
 	return 0;
 }
 
-int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
-				u32 sts_boot_dev_sts0_reg,
-				u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg,
-				u32 boot_err1_reg, u32 timeout)
+int hl_fw_read_preboot_status(struct hl_device *hdev)
 {
 	int rc;
 
 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU))
 		return 0;
 
+	/* get FW pre-load parameters  */
+	hdev->asic_funcs->init_firmware_preload_params(hdev);
+
 	/*
 	 * In order to determine boot method (static VS dymanic) we need to
 	 * read the boot caps register
 	 */
-	rc = hl_fw_read_preboot_caps(hdev, cpu_boot_status_reg,
-					sts_boot_dev_sts0_reg,
-					sts_boot_dev_sts1_reg, boot_err0_reg,
-					boot_err1_reg, timeout);
+	rc = hl_fw_read_preboot_caps(hdev);
 	if (rc)
 		return rc;
 
@@ -2454,6 +2466,13 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	 */
 	dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs;
 
+	/* if no preboot loaded indication- wait for preboot */
+	if (!(hdev->fw_loader.fw_comp_loaded & FW_TYPE_PREBOOT_CPU)) {
+		rc = hl_fw_wait_preboot_ready(hdev);
+		if (rc)
+			return -EIO;
+	}
+
 	rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_RST_STATE,
 						0, true,
 						fw_loader->cpu_timeout);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 6f92e3088375..8c38c2c1b1dc 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1307,6 +1307,24 @@ struct dynamic_fw_load_mgr {
 	bool fw_desc_valid;
 };
 
+/**
+ * struct pre_fw_load_props - needed properties for pre-FW load
+ * @cpu_boot_status_reg: cpu_boot_status register address
+ * @sts_boot_dev_sts0_reg: sts_boot_dev_sts0 register address
+ * @sts_boot_dev_sts1_reg: sts_boot_dev_sts1 register address
+ * @boot_err0_reg: boot_err0 register address
+ * @boot_err1_reg: boot_err1 register address
+ * @wait_for_preboot_timeout: timeout to poll for preboot ready
+ */
+struct pre_fw_load_props {
+	u32 cpu_boot_status_reg;
+	u32 sts_boot_dev_sts0_reg;
+	u32 sts_boot_dev_sts1_reg;
+	u32 boot_err0_reg;
+	u32 boot_err1_reg;
+	u32 wait_for_preboot_timeout;
+};
+
 /**
  * struct fw_image_props - properties of FW image
  * @image_name: name of the image
@@ -1323,6 +1341,7 @@ struct fw_image_props {
  * struct fw_load_mgr - manager FW loading process
  * @dynamic_loader: specific structure for dynamic load
  * @static_loader: specific structure for static load
+ * @pre_fw_load: parameters for pre FW load
  * @boot_fit_img: boot fit image properties
  * @linux_img: linux image properties
  * @cpu_timeout: CPU response timeout in usec
@@ -1338,6 +1357,7 @@ struct fw_load_mgr {
 		struct dynamic_fw_load_mgr dynamic_loader;
 		struct static_fw_load_mgr static_loader;
 	};
+	struct pre_fw_load_props pre_fw_load;
 	struct fw_image_props boot_fit_img;
 	struct fw_image_props linux_img;
 	u32 cpu_timeout;
@@ -1467,6 +1487,7 @@ struct hl_cs;
  * @get_msi_info: Retrieve asic-specific MSI ID of the f/w async event
  * @map_pll_idx_to_fw_idx: convert driver specific per asic PLL index to
  *                         generic f/w compatible PLL Indexes
+ * @init_firmware_preload_params: initialize pre FW-load parameters.
  * @init_firmware_loader: initialize data for FW loader.
  * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling
  * @state_dump_init: initialize constants required for state dump
@@ -1599,6 +1620,7 @@ struct hl_asic_funcs {
 	int (*ack_mmu_errors)(struct hl_device *hdev, u64 mmu_cap_mask);
 	void (*get_msi_info)(__le32 *table);
 	int (*map_pll_idx_to_fw_idx)(u32 pll_idx);
+	void (*init_firmware_preload_params)(struct hl_device *hdev);
 	void (*init_firmware_loader)(struct hl_device *hdev);
 	void (*init_cpu_scrambler_dram)(struct hl_device *hdev);
 	void (*state_dump_init)(struct hl_device *hdev);
@@ -3577,10 +3599,7 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power);
 void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev);
 void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev);
 int hl_fw_init_cpu(struct hl_device *hdev);
-int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
-				u32 sts_boot_dev_sts0_reg,
-				u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg,
-				u32 boot_err1_reg, u32 timeout);
+int hl_fw_read_preboot_status(struct hl_device *hdev);
 int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
 				struct fw_load_mgr *fw_loader,
 				enum comms_cmd cmd, unsigned int size,
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index dcda3ac94fbd..31a2589929f2 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -869,11 +869,7 @@ static int gaudi_early_init(struct hl_device *hdev)
 	/* Before continuing in the initialization, we need to read the preboot
 	 * version to determine whether we run with a security-enabled firmware
 	 */
-	rc = hl_fw_read_preboot_status(hdev, mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS,
-					mmCPU_BOOT_DEV_STS0,
-					mmCPU_BOOT_DEV_STS1, mmCPU_BOOT_ERR0,
-					mmCPU_BOOT_ERR1,
-					GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC);
+	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
 			hdev->asic_funcs->hw_fini(hdev, true, false);
@@ -3840,6 +3836,18 @@ static void gaudi_init_static_firmware_loader(struct hl_device *hdev)
 			GAUDI_CPU_RESET_WAIT_MSEC;
 }
 
+static void gaudi_init_firmware_preload_params(struct hl_device *hdev)
+{
+	struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load;
+
+	pre_fw_load->cpu_boot_status_reg = mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS;
+	pre_fw_load->sts_boot_dev_sts0_reg = mmCPU_BOOT_DEV_STS0;
+	pre_fw_load->sts_boot_dev_sts1_reg = mmCPU_BOOT_DEV_STS1;
+	pre_fw_load->boot_err0_reg = mmCPU_BOOT_ERR0;
+	pre_fw_load->boot_err1_reg = mmCPU_BOOT_ERR1;
+	pre_fw_load->wait_for_preboot_timeout = GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC;
+}
+
 static void gaudi_init_firmware_loader(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -9231,6 +9239,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.enable_events_from_fw = gaudi_enable_events_from_fw,
 	.ack_mmu_errors = gaudi_ack_mmu_page_fault_or_access_error,
 	.map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx,
+	.init_firmware_preload_params = gaudi_init_firmware_preload_params,
 	.init_firmware_loader = gaudi_init_firmware_loader,
 	.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
 	.state_dump_init = gaudi_state_dump_init,
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index dbbd08600a56..5e6df7814fb1 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -2531,11 +2531,7 @@ static int gaudi2_early_init(struct hl_device *hdev)
 	/* Before continuing in the initialization, we need to read the preboot
 	 * version to determine whether we run with a security-enabled firmware
 	 */
-	rc = hl_fw_read_preboot_status(hdev, mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS,
-					mmCPU_BOOT_DEV_STS0,
-					mmCPU_BOOT_DEV_STS1, mmCPU_BOOT_ERR0,
-					mmCPU_BOOT_ERR1,
-					GAUDI2_PREBOOT_REQ_TIMEOUT_USEC);
+	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
 			hdev->asic_funcs->hw_fini(hdev, true, false);
@@ -3832,6 +3828,18 @@ static void gaudi2_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw
 	gaudi2_sync_irqs(hdev);
 }
 
+static void gaudi2_init_firmware_preload_params(struct hl_device *hdev)
+{
+	struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load;
+
+	pre_fw_load->cpu_boot_status_reg = mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS;
+	pre_fw_load->sts_boot_dev_sts0_reg = mmCPU_BOOT_DEV_STS0;
+	pre_fw_load->sts_boot_dev_sts1_reg = mmCPU_BOOT_DEV_STS1;
+	pre_fw_load->boot_err0_reg = mmCPU_BOOT_ERR0;
+	pre_fw_load->boot_err1_reg = mmCPU_BOOT_ERR1;
+	pre_fw_load->wait_for_preboot_timeout = GAUDI2_PREBOOT_REQ_TIMEOUT_USEC;
+}
+
 static void gaudi2_init_firmware_loader(struct hl_device *hdev)
 {
 	struct fw_load_mgr *fw_loader = &hdev->fw_loader;
@@ -9762,6 +9770,7 @@ static const struct hl_asic_funcs gaudi2_funcs = {
 	.ack_mmu_errors = gaudi2_ack_mmu_page_fault_or_access_error,
 	.get_msi_info = gaudi2_get_msi_info,
 	.map_pll_idx_to_fw_idx = gaudi2_map_pll_idx_to_fw_idx,
+	.init_firmware_preload_params = gaudi2_init_firmware_preload_params,
 	.init_firmware_loader = gaudi2_init_firmware_loader,
 	.init_cpu_scrambler_dram = gaudi2_init_scrambler_hbm,
 	.state_dump_init = gaudi2_state_dump_init,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 988fafce2a3c..0c333b42225a 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -665,11 +665,7 @@ static int goya_early_init(struct hl_device *hdev)
 	/* Before continuing in the initialization, we need to read the preboot
 	 * version to determine whether we run with a security-enabled firmware
 	 */
-	rc = hl_fw_read_preboot_status(hdev, mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS,
-					mmCPU_BOOT_DEV_STS0,
-					mmCPU_BOOT_DEV_STS1, mmCPU_BOOT_ERR0,
-					mmCPU_BOOT_ERR1,
-					GOYA_BOOT_FIT_REQ_TIMEOUT_USEC);
+	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
 			hdev->asic_funcs->hw_fini(hdev, true, false);
@@ -2580,6 +2576,18 @@ static void goya_init_static_firmware_loader(struct hl_device *hdev)
 	static_loader->sram_offset_mask = ~(lower_32_bits(SRAM_BASE_ADDR));
 }
 
+static void goya_init_firmware_preload_params(struct hl_device *hdev)
+{
+	struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load;
+
+	pre_fw_load->cpu_boot_status_reg = mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS;
+	pre_fw_load->sts_boot_dev_sts0_reg = mmCPU_BOOT_DEV_STS0;
+	pre_fw_load->sts_boot_dev_sts1_reg = mmCPU_BOOT_DEV_STS1;
+	pre_fw_load->boot_err0_reg = mmCPU_BOOT_ERR0;
+	pre_fw_load->boot_err1_reg = mmCPU_BOOT_ERR1;
+	pre_fw_load->wait_for_preboot_timeout = GOYA_BOOT_FIT_REQ_TIMEOUT_USEC;
+}
+
 static void goya_init_firmware_loader(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -5510,6 +5518,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.enable_events_from_fw = goya_enable_events_from_fw,
 	.ack_mmu_errors = goya_ack_mmu_page_fault_or_access_error,
 	.map_pll_idx_to_fw_idx = goya_map_pll_idx_to_fw_idx,
+	.init_firmware_preload_params = goya_init_firmware_preload_params,
 	.init_firmware_loader = goya_init_firmware_loader,
 	.init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram,
 	.state_dump_init = goya_state_dump_init,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 03/12] habanalabs: naming refactor of user interrupt flow
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
  2022-07-04  9:29 ` [PATCH 02/12] habanalabs: wait for preboot ready after hard reset Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 04/12] habanalabs: add support for common decoder interrupts Oded Gabbay
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

The current naming convention can be misleading. Rename some
variables and defines to make them more explicit.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c |  6 +++---
 drivers/misc/habanalabs/common/habanalabs.h         |  8 ++++----
 drivers/misc/habanalabs/common/irq.c                | 12 ++++++------
 drivers/misc/habanalabs/gaudi2/gaudi2.c             |  6 +++---
 include/uapi/misc/habanalabs.h                      |  7 +++++--
 5 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index e91ca31d4930..275dcb69a40e 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -1080,7 +1080,7 @@ void hl_release_pending_user_interrupts(struct hl_device *hdev)
 		wake_pending_user_interrupt_threads(interrupt);
 	}
 
-	interrupt = &hdev->common_user_interrupt;
+	interrupt = &hdev->common_user_cq_interrupt;
 	wake_pending_user_interrupt_threads(interrupt);
 }
 
@@ -3373,8 +3373,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 		int_idx = interrupt_id - first_interrupt + prop->user_dec_intr_count;
 		interrupt = &hdev->user_interrupt[int_idx];
 
-	} else if (interrupt_id == HL_COMMON_USER_INTERRUPT_ID) {
-		interrupt = &hdev->common_user_interrupt;
+	} else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) {
+		interrupt = &hdev->common_user_cq_interrupt;
 	} else {
 		dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id);
 		return -EINVAL;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 8c38c2c1b1dc..9b2451f3619a 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -76,7 +76,7 @@ struct hl_fpriv;
 
 #define HL_INVALID_QUEUE		UINT_MAX
 
-#define HL_COMMON_USER_INTERRUPT_ID	0xFFF
+#define HL_COMMON_USER_CQ_INTERRUPT_ID	0xFFF
 
 #define HL_STATE_DUMP_HIST_LEN		5
 
@@ -2952,8 +2952,8 @@ struct hl_reset_info {
  * @user_interrupt: array of hl_user_interrupt. upon the corresponding user
  *                  interrupt, driver will monitor the list of fences
  *                  registered to this interrupt.
- * @common_user_interrupt: common user interrupt for all user interrupts.
- *                         upon any user interrupt, driver will monitor the
+ * @common_user_cq_interrupt: common user CQ interrupt for all user CQ interrupts.
+ *                         upon any user CQ interrupt, driver will monitor the
  *                         list of fences registered to this common structure.
  * @shadow_cs_queue: pointer to a shadow queue that holds pointers to
  *                   outstanding command submissions.
@@ -3118,7 +3118,7 @@ struct hl_device {
 	enum hl_asic_type		asic_type;
 	struct hl_cq			*completion_queue;
 	struct hl_user_interrupt	*user_interrupt;
-	struct hl_user_interrupt	common_user_interrupt;
+	struct hl_user_interrupt	common_user_cq_interrupt;
 	struct hl_cs			**shadow_cs_queue;
 	struct workqueue_struct		**cq_wq;
 	struct workqueue_struct		*eq_wq;
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index c1088377d1de..fd8f2bd9020e 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -269,7 +269,7 @@ static int handle_registration_node(struct hl_device *hdev, struct hl_user_pendi
 	return 0;
 }
 
-static void handle_user_cq(struct hl_device *hdev, struct hl_user_interrupt *user_cq)
+static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interrupt *intr)
 {
 	struct hl_user_pending_interrupt *pend, *temp_pend;
 	struct list_head *ts_reg_free_list_head = NULL;
@@ -291,8 +291,8 @@ static void handle_user_cq(struct hl_device *hdev, struct hl_user_interrupt *use
 	if (!job)
 		return;
 
-	spin_lock(&user_cq->wait_list_lock);
-	list_for_each_entry_safe(pend, temp_pend, &user_cq->wait_list_head, wait_list_node) {
+	spin_lock(&intr->wait_list_lock);
+	list_for_each_entry_safe(pend, temp_pend, &intr->wait_list_head, wait_list_node) {
 		if ((pend->cq_kernel_addr && *(pend->cq_kernel_addr) >= pend->cq_target_value) ||
 				!pend->cq_kernel_addr) {
 			if (pend->ts_reg_info.buf) {
@@ -309,7 +309,7 @@ static void handle_user_cq(struct hl_device *hdev, struct hl_user_interrupt *use
 			}
 		}
 	}
-	spin_unlock(&user_cq->wait_list_lock);
+	spin_unlock(&intr->wait_list_lock);
 
 	if (ts_reg_free_list_head) {
 		INIT_WORK(&job->free_obj, hl_ts_free_objects);
@@ -339,10 +339,10 @@ irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
 	 */
 	if (!user_int->is_decoder)
 		/* Handle user cq interrupts registered on all interrupts */
-		handle_user_cq(hdev, &hdev->common_user_interrupt);
+		handle_user_interrupt(hdev, &hdev->common_user_cq_interrupt);
 
 	/* Handle user cq or decoder interrupts registered on this specific irq */
-	handle_user_cq(hdev, user_int);
+	handle_user_interrupt(hdev, user_int);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 5e6df7814fb1..5f61c45e4695 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -2891,9 +2891,9 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev)
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int i, j, k;
 
-	/* Initialize common user interrupt */
-	HL_USR_INTR_STRUCT_INIT(hdev->common_user_interrupt, hdev, HL_COMMON_USER_INTERRUPT_ID,
-				false);
+	/* Initialize common user CQ interrupt */
+	HL_USR_INTR_STRUCT_INIT(hdev->common_user_cq_interrupt, hdev,
+				HL_COMMON_USER_CQ_INTERRUPT_ID, false);
 
 	/* User interrupts structure holds both decoder and user interrupts from various engines.
 	 * We first initialize the decoder interrupts and then we add the user interrupts.
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 77b89c537ee8..4ee24a3a13e9 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1442,6 +1442,7 @@ union hl_cs_args {
 
 #define HL_WAIT_CS_FLAGS_INTERRUPT		0x2
 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK		0xFFF00000
+#define HL_WAIT_CS_FLAGS_ANY_CQ_INTERRUPT	0xFFF00000
 #define HL_WAIT_CS_FLAGS_MULTI_CS		0x4
 #define HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ	0x10
 #define HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT	0x20
@@ -1491,8 +1492,10 @@ struct hl_wait_cs_in {
 
 	/* HL_WAIT_CS_FLAGS_*
 	 * If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include
-	 * interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK, in order
-	 * not to specify an interrupt id ,set mask to all 1s.
+	 * interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK
+	 *
+	 * in order to wait for any CQ interrupt, set interrupt value to
+	 * HL_WAIT_CS_FLAGS_ANY_CQ_INTERRUPT.
 	 */
 	__u32 flags;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 04/12] habanalabs: add support for common decoder interrupts
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
  2022-07-04  9:29 ` [PATCH 02/12] habanalabs: wait for preboot ready after hard reset Oded Gabbay
  2022-07-04  9:29 ` [PATCH 03/12] habanalabs: naming refactor of user interrupt flow Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 05/12] habanalabs: save f/w preboot minor version Oded Gabbay
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

A user application should be able to get a notification for any decoder
completion. Hence, we introduce a new interface through which a user
can wait for all currently pending decoder interrupts.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 5 +++++
 drivers/misc/habanalabs/common/habanalabs.h         | 3 +++
 drivers/misc/habanalabs/common/irq.c                | 9 +++------
 drivers/misc/habanalabs/gaudi2/gaudi2.c             | 4 ++++
 include/uapi/misc/habanalabs.h                      | 4 ++++
 5 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 275dcb69a40e..eb5f1aee15fc 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -1082,6 +1082,9 @@ void hl_release_pending_user_interrupts(struct hl_device *hdev)
 
 	interrupt = &hdev->common_user_cq_interrupt;
 	wake_pending_user_interrupt_threads(interrupt);
+
+	interrupt = &hdev->common_decoder_interrupt;
+	wake_pending_user_interrupt_threads(interrupt);
 }
 
 static void job_wq_completion(struct work_struct *work)
@@ -3375,6 +3378,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	} else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) {
 		interrupt = &hdev->common_user_cq_interrupt;
+	} else if (interrupt_id == HL_COMMON_DEC_INTERRUPT_ID) {
+		interrupt = &hdev->common_decoder_interrupt;
 	} else {
 		dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id);
 		return -EINVAL;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 9b2451f3619a..7e84f2ce49ae 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -77,6 +77,7 @@ struct hl_fpriv;
 #define HL_INVALID_QUEUE		UINT_MAX
 
 #define HL_COMMON_USER_CQ_INTERRUPT_ID	0xFFF
+#define HL_COMMON_DEC_INTERRUPT_ID	0xFFE
 
 #define HL_STATE_DUMP_HIST_LEN		5
 
@@ -2955,6 +2956,7 @@ struct hl_reset_info {
  * @common_user_cq_interrupt: common user CQ interrupt for all user CQ interrupts.
  *                         upon any user CQ interrupt, driver will monitor the
  *                         list of fences registered to this common structure.
+ * @common_decoder_interrupt: common decoder interrupt for all user decoder interrupts.
  * @shadow_cs_queue: pointer to a shadow queue that holds pointers to
  *                   outstanding command submissions.
  * @cq_wq: work queues of completion queues for executing work in process
@@ -3119,6 +3121,7 @@ struct hl_device {
 	struct hl_cq			*completion_queue;
 	struct hl_user_interrupt	*user_interrupt;
 	struct hl_user_interrupt	common_user_cq_interrupt;
+	struct hl_user_interrupt	common_decoder_interrupt;
 	struct hl_cs			**shadow_cs_queue;
 	struct workqueue_struct		**cq_wq;
 	struct workqueue_struct		*eq_wq;
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index fd8f2bd9020e..d60dafb03a8e 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -333,12 +333,9 @@ irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
 	struct hl_user_interrupt *user_int = arg;
 	struct hl_device *hdev = user_int->hdev;
 
-	/* If the interrupt is not a decoder interrupt, it means the interrupt
-	 * belongs to a user cq. In that case, before handling it, we need to handle the common
-	 * user cq
-	 */
-	if (!user_int->is_decoder)
-		/* Handle user cq interrupts registered on all interrupts */
+	if (user_int->is_decoder)
+		handle_user_interrupt(hdev, &hdev->common_decoder_interrupt);
+	else
 		handle_user_interrupt(hdev, &hdev->common_user_cq_interrupt);
 
 	/* Handle user cq or decoder interrupts registered on this specific irq */
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 5f61c45e4695..71eb767835bd 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -2895,6 +2895,10 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev)
 	HL_USR_INTR_STRUCT_INIT(hdev->common_user_cq_interrupt, hdev,
 				HL_COMMON_USER_CQ_INTERRUPT_ID, false);
 
+	/* Initialize common decoder interrupt */
+	HL_USR_INTR_STRUCT_INIT(hdev->common_decoder_interrupt, hdev,
+				HL_COMMON_DEC_INTERRUPT_ID, true);
+
 	/* User interrupts structure holds both decoder and user interrupts from various engines.
 	 * We first initialize the decoder interrupts and then we add the user interrupts.
 	 * The only limitation is that the last decoder interrupt id must be smaller
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 4ee24a3a13e9..8c6ab71e7831 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1443,6 +1443,7 @@ union hl_cs_args {
 #define HL_WAIT_CS_FLAGS_INTERRUPT		0x2
 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK		0xFFF00000
 #define HL_WAIT_CS_FLAGS_ANY_CQ_INTERRUPT	0xFFF00000
+#define HL_WAIT_CS_FLAGS_ANY_DEC_INTERRUPT	0xFFE00000
 #define HL_WAIT_CS_FLAGS_MULTI_CS		0x4
 #define HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ	0x10
 #define HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT	0x20
@@ -1496,6 +1497,9 @@ struct hl_wait_cs_in {
 	 *
 	 * in order to wait for any CQ interrupt, set interrupt value to
 	 * HL_WAIT_CS_FLAGS_ANY_CQ_INTERRUPT.
+	 *
+	 * in order to wait for any decoder interrupt, set interrupt value to
+	 * HL_WAIT_CS_FLAGS_ANY_DEC_INTERRUPT.
 	 */
 	__u32 flags;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 05/12] habanalabs: save f/w preboot minor version
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
                   ` (2 preceding siblings ...)
  2022-07-04  9:29 ` [PATCH 04/12] habanalabs: add support for common decoder interrupts Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 06/12] habanalabs: allow detection of unsupported f/w packets Oded Gabbay
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel; +Cc: Sagiv Ozeri

From: Sagiv Ozeri <sozeri@habana.ai>

We need this property for backward compatibility with the f/w.

Signed-off-by: Sagiv Ozeri <sozeri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 49 ++++++++++++++++----
 drivers/misc/habanalabs/common/habanalabs.h  |  4 +-
 2 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 64c5cdfc6dcf..04ca4aaee446 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -41,7 +41,7 @@ static char *extract_fw_ver_from_str(const char *fw_str)
 	ver_offset = str - fw_str;
 
 	/* Copy until the next whitespace */
-	whitespace =  strnstr(str, " ", VERSION_MAX_LEN - ver_offset);
+	whitespace = strnstr(str, " ", VERSION_MAX_LEN - ver_offset);
 	if (!whitespace)
 		goto free_fw_ver;
 
@@ -54,6 +54,43 @@ static char *extract_fw_ver_from_str(const char *fw_str)
 	return NULL;
 }
 
+static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
+{
+	char major[8], minor[8], *first_dot, *second_dot;
+	int rc;
+
+	first_dot = strnstr(preboot_ver, ".", 10);
+	if (first_dot) {
+		strscpy(major, preboot_ver, first_dot - preboot_ver + 1);
+		rc = kstrtou32(major, 10, &hdev->fw_major_version);
+	} else {
+		rc = -EINVAL;
+	}
+
+	if (rc) {
+		dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc);
+		goto out;
+	}
+
+	/* skip the first dot */
+	first_dot++;
+
+	second_dot = strnstr(first_dot, ".", 10);
+	if (second_dot) {
+		strscpy(minor, first_dot, second_dot - first_dot + 1);
+		rc = kstrtou32(minor, 10, &hdev->fw_minor_version);
+	} else {
+		rc = -EINVAL;
+	}
+
+	if (rc)
+		dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc);
+
+out:
+	kfree(preboot_ver);
+	return rc;
+}
+
 static int hl_request_fw(struct hl_device *hdev,
 				const struct firmware **firmware_p,
 				const char *fw_name)
@@ -2012,18 +2049,14 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
 
 		preboot_ver = extract_fw_ver_from_str(prop->preboot_ver);
 		if (preboot_ver) {
-			char major[8];
 			int rc;
 
 			dev_info(hdev->dev, "preboot version %s\n", preboot_ver);
-			sprintf(major, "%.2s", preboot_ver);
-			kfree(preboot_ver);
 
-			rc = kstrtou32(major, 10, &hdev->fw_major_version);
-			if (rc) {
-				dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc);
+			/* This function takes care of freeing preboot_ver */
+			rc = extract_fw_sub_versions(hdev, preboot_ver);
+			if (rc)
 				return rc;
-			}
 		}
 
 		break;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 7e84f2ce49ae..72cb12f2068a 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -3012,7 +3012,8 @@ struct hl_reset_info {
  * @last_error: holds information about last session in which CS timeout or razwi error occurred.
  * @reset_info: holds current device reset information.
  * @stream_master_qid_arr: pointer to array with QIDs of master streams.
- * @fw_major_version: major version of current loaded preboot
+ * @fw_major_version: major version of current loaded preboot.
+ * @fw_minor_version: minor version of current loaded preboot.
  * @dram_used_mem: current DRAM memory consumption.
  * @memory_scrub_val: the value to which the dram will be scrubbed to using cb scrub_device_dram
  * @timeout_jiffies: device CS timeout value.
@@ -3186,6 +3187,7 @@ struct hl_device {
 
 	u32				*stream_master_qid_arr;
 	u32				fw_major_version;
+	u32				fw_minor_version;
 	atomic64_t			dram_used_mem;
 	u64				memory_scrub_val;
 	u64				timeout_jiffies;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 06/12] habanalabs: allow detection of unsupported f/w packets
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
                   ` (3 preceding siblings ...)
  2022-07-04  9:29 ` [PATCH 05/12] habanalabs: save f/w preboot minor version Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 07/12] habanalabs/gaudi2: remove unused variable Oded Gabbay
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel

If we send a packet to the f/w, and that packet is unsupported, we
want to be able to identify this situation and possibly ignore it.

Therefore, if the f/w returned an error, we need to propagate it
to the callers in the result value, if those callers were interested
in it.

In addition, there is no point in printing the error code here, because
each caller prints its own error with a specific message.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 04ca4aaee446..fd8dd332a59a 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -327,11 +327,15 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 
 	rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
 	if (rc) {
-		dev_err(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
-			rc,
-			(tmp & CPUCP_PKT_CTL_OPCODE_MASK)
-						>> CPUCP_PKT_CTL_OPCODE_SHIFT);
+		dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
+			rc, (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT);
+
+		/* propagate the return code from the f/w to the callers who want to check it */
+		if (result)
+			*result = rc;
+
 		rc = -EIO;
+
 	} else if (result) {
 		*result = le64_to_cpu(pkt->result);
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 07/12] habanalabs/gaudi2: remove unused variable
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
                   ` (4 preceding siblings ...)
  2022-07-04  9:29 ` [PATCH 06/12] habanalabs: allow detection of unsupported f/w packets Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 08/12] habanalabs/gaudi2: SM mask can only be 8-bit Oded Gabbay
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel; +Cc: kernel test robot

glbl_sts_clr_val was set but never used

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 71eb767835bd..6911b42e52e1 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -6802,7 +6802,7 @@ static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base, u32 str
 static void gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *qm_name,
 						u64 qman_base, u32 qid_base)
 {
-	u32 i, j, glbl_sts_val, arb_err_val, glbl_sts_clr_val, num_error_causes;
+	u32 i, j, glbl_sts_val, arb_err_val, num_error_causes;
 	u64 glbl_sts_addr, arb_err_addr;
 	char reg_desc[32];
 
@@ -6811,7 +6811,6 @@ static void gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *q
 
 	/* Iterate through all stream GLBL_ERR_STS registers + Lower CP */
 	for (i = 0 ; i < QMAN_STREAMS + 1 ; i++) {
-		glbl_sts_clr_val = 0;
 		glbl_sts_val = RREG32(glbl_sts_addr + 4 * i);
 
 		if (!glbl_sts_val)
@@ -6825,16 +6824,13 @@ static void gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *q
 			num_error_causes = GAUDI2_NUM_OF_QM_ERR_CAUSE;
 		}
 
-		for (j = 0 ; j < num_error_causes ; j++) {
-			if (glbl_sts_val & BIT(j)) {
+		for (j = 0 ; j < num_error_causes ; j++)
+			if (glbl_sts_val & BIT(j))
 				dev_err_ratelimited(hdev->dev, "%s %s. err cause: %s\n",
 						qm_name, reg_desc,
 						i == QMAN_STREAMS ?
 						gaudi2_qman_lower_cp_error_cause[j] :
 						gaudi2_qman_error_cause[j]);
-				glbl_sts_clr_val |= BIT(j);
-			}
-		}
 
 		print_qman_data_on_err(hdev, qid_base, i, qman_base);
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 08/12] habanalabs/gaudi2: SM mask can only be 8-bit
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
                   ` (5 preceding siblings ...)
  2022-07-04  9:29 ` [PATCH 07/12] habanalabs/gaudi2: remove unused variable Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 09/12] habanalabs: do not set max power on a secured device Oded Gabbay
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel; +Cc: kernel test robot

Otherwise, due to how we calculate it, we might fail in FIELD_PREP
checks.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 6911b42e52e1..dbf273d96b00 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -5751,8 +5751,9 @@ static void gaudi2_kdma_set_mmbp_asid(struct hl_device *hdev,
 static void gaudi2_arm_cq_monitor(struct hl_device *hdev, u32 index, u32 cq_id,
 						u32 mon_payload, u32 sync_value)
 {
+	u32 sync_group_id, mode, mon_arm;
 	int offset = index * 4;
-	u32 sync_group_id, mask, mode, mon_arm;
+	u8 mask;
 
 	/* Reset the SOB value */
 	WREG32(mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + offset, 0);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 09/12] habanalabs: do not set max power on a secured device
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
                   ` (6 preceding siblings ...)
  2022-07-04  9:29 ` [PATCH 08/12] habanalabs/gaudi2: SM mask can only be 8-bit Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 10/12] habanalabs: don't declare tmp twice in same function Oded Gabbay
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Max power API is not supported in secured devices. Hence, we should
skip setting it during boot.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 9f3778c82e54..99d84b46aeb6 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1545,7 +1545,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 			goto out_err;
 		}
 
-		hl_fw_set_max_power(hdev);
+		if (!hdev->asic_prop.fw_security_enabled)
+			hl_fw_set_max_power(hdev);
 	} else {
 		rc = hdev->asic_funcs->non_hard_reset_late_init(hdev);
 		if (rc) {
@@ -1914,7 +1915,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	/* Need to call this again because the max power might change,
 	 * depending on card type for certain ASICs
 	 */
-	if (hdev->asic_prop.set_max_power_on_device_init)
+	if (hdev->asic_prop.set_max_power_on_device_init &&
+			!hdev->asic_prop.fw_security_enabled)
 		hl_fw_set_max_power(hdev);
 
 	/*
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 10/12] habanalabs: don't declare tmp twice in same function
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
                   ` (7 preceding siblings ...)
  2022-07-04  9:29 ` [PATCH 09/12] habanalabs: do not set max power on a secured device Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 11/12] habanalabs: make sure variable is set before used Oded Gabbay
  2022-07-04  9:29 ` [PATCH 12/12] habanalabs/gaudi2: remove unused defines Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel

tmp is declared both in the scope of the function cs_do_release() and
again inside a nested block of that function. Rename the inner variable
so it no longer shadows the outer one.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index eb5f1aee15fc..941f1ff190ae 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -718,9 +718,9 @@ static void cs_do_release(struct kref *ref)
 		 * staged submission
 		 */
 		if (cs->staged_last) {
-			struct hl_cs *staged_cs, *tmp;
+			struct hl_cs *staged_cs, *tmp_cs;
 
-			list_for_each_entry_safe(staged_cs, tmp,
+			list_for_each_entry_safe(staged_cs, tmp_cs,
 					&cs->staged_cs_node, staged_cs_node)
 				staged_cs_put(hdev, staged_cs);
 		}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 11/12] habanalabs: make sure variable is set before used
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
                   ` (8 preceding siblings ...)
  2022-07-04  9:29 ` [PATCH 10/12] habanalabs: don't declare tmp twice in same function Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  2022-07-04  9:29 ` [PATCH 12/12] habanalabs/gaudi2: remove unused defines Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel

timestamp could be unset in both _hl_interrupt_wait_ioctl() and
_hl_interrupt_wait_ioctl_user_addr() so it is better to explicitly
initialize it to 0 when declaring it.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 941f1ff190ae..90a4574cbe2d 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -3345,8 +3345,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	struct hl_user_interrupt *interrupt;
 	union hl_wait_cs_args *args = data;
 	u32 status = HL_WAIT_CS_STATUS_BUSY;
+	u64 timestamp = 0;
 	int rc, int_idx;
-	u64 timestamp;
 
 	prop = &hdev->asic_prop;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 12/12] habanalabs/gaudi2: remove unused defines
  2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
                   ` (9 preceding siblings ...)
  2022-07-04  9:29 ` [PATCH 11/12] habanalabs: make sure variable is set before used Oded Gabbay
@ 2022-07-04  9:29 ` Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2022-07-04  9:29 UTC (permalink / raw)
  To: linux-kernel

Some defines are unused in the current upstreamed code, so remove
them.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c             | 7 -------
 drivers/misc/habanalabs/gaudi2/gaudi2.c           | 4 ----
 drivers/misc/habanalabs/gaudi2/gaudi2_coresight.c | 2 --
 3 files changed, 13 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 31a2589929f2..1f84dd6f3adb 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -97,15 +97,8 @@
 
 #define GAUDI_ARB_WDT_TIMEOUT		0xEE6b27FF /* 8 seconds */
 
-#define GAUDI_CLK_GATE_DEBUGFS_MASK	(\
-		BIT(GAUDI_ENGINE_ID_MME_0) |\
-		BIT(GAUDI_ENGINE_ID_MME_2) |\
-		GENMASK_ULL(GAUDI_ENGINE_ID_TPC_7, GAUDI_ENGINE_ID_TPC_0))
-
 #define HBM_SCRUBBING_TIMEOUT_US	1000000 /* 1s */
 
-#define GAUDI_PLL_MAX 10
-
 #define BIN_REG_STRING_SIZE	sizeof("0b10101010101010101010101010101010")
 
 #define MONITOR_SOB_STRING_SIZE		256
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index dbf273d96b00..919e5028f341 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -74,10 +74,6 @@
 #define GAUDI2_NUM_OF_HIF_FATAL_ERR_CAUSE	2
 #define GAUDI2_NUM_OF_AXI_DRAIN_ERR_CAUSE	2
 #define GAUDI2_NUM_OF_HBM_MC_SPI_CAUSE		5
-#define GAUDI2_NUM_OF_NIC_RXB_CORE_SEI_CAUSE	2
-#define GAUDI2_NUM_OF_NIC_RXB_CORE_SPI_CAUSE	6
-#define GAUDI2_NUM_OF_NIC_RXE_SEI_CAUSE		4
-#define GAUDI2_NUM_OF_NIC_RXE_SPI_CAUSE		24
 
 #define GAUDI2_MMU_CACHE_INV_TIMEOUT_USEC	(MMU_CONFIG_TIMEOUT_USEC * 10)
 #define GAUDI2_PLDM_MMU_TIMEOUT_USEC		(MMU_CONFIG_TIMEOUT_USEC * 200)
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2_coresight.c b/drivers/misc/habanalabs/gaudi2/gaudi2_coresight.c
index cf70735bf1e3..56c6ab692482 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2_coresight.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2_coresight.c
@@ -9,8 +9,6 @@
 
 #define GAUDI2_PLDM_CORESIGHT_TIMEOUT_USEC	(CORESIGHT_TIMEOUT_USEC * 2000)
 #define SPMU_MAX_COUNTERS			6
-/* SPMU should also include overflow_idx and cycle_cnt_idx */
-#define SPMU_DATA_LEN				(SPMU_MAX_COUNTERS + 2)
 
 #define COMPONENT_ID_INVALID ((u32)(-1))
 #define MAX_BMONS_PER_UNIT 8
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2022-07-04  9:30 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-04  9:29 [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event Oded Gabbay
2022-07-04  9:29 ` [PATCH 02/12] habanalabs: wait for preboot ready after hard reset Oded Gabbay
2022-07-04  9:29 ` [PATCH 03/12] habanalabs: naming refactor of user interrupt flow Oded Gabbay
2022-07-04  9:29 ` [PATCH 04/12] habanalabs: add support for common decoder interrupts Oded Gabbay
2022-07-04  9:29 ` [PATCH 05/12] habanalabs: save f/w preboot minor version Oded Gabbay
2022-07-04  9:29 ` [PATCH 06/12] habanalabs: allow detection of unsupported f/w packets Oded Gabbay
2022-07-04  9:29 ` [PATCH 07/12] habanalabs/gaudi2: remove unused variable Oded Gabbay
2022-07-04  9:29 ` [PATCH 08/12] habanalabs/gaudi2: SM mask can only be 8-bit Oded Gabbay
2022-07-04  9:29 ` [PATCH 09/12] habanalabs: do not set max power on a secured device Oded Gabbay
2022-07-04  9:29 ` [PATCH 10/12] habanalabs: don't declare tmp twice in same function Oded Gabbay
2022-07-04  9:29 ` [PATCH 11/12] habanalabs: make sure variable is set before used Oded Gabbay
2022-07-04  9:29 ` [PATCH 12/12] habanalabs/gaudi2: remove unused defines Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).