All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] habanalabs: bypass reset for continuous h/w error event
@ 2021-10-10  8:03 Oded Gabbay
  2021-10-10  8:03 ` [PATCH 2/5] habanalabs: update firmware files Oded Gabbay
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-10-10  8:03 UTC (permalink / raw)
  To: linux-kernel; +Cc: Bharat Jauhari

From: Bharat Jauhari <bjauhari@habana.ai>

There may be a situation where drivers receives continuous fatal H/W
error events from FW immediately post reset cycle.
This may be due to some fault on the silicon itself.
In such case its better to bypass reset cycle so we won't be stuck in
endless loop of resets.

This commit bypasses reset request in case driver received two back to
back FW fatal error before first occurrence of heartbeat event.

Signed-off-by: Bharat Jauhari <bjauhari@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c       | 118 +++++++++++++-----
 drivers/misc/habanalabs/common/habanalabs.h   |  14 +++
 .../misc/habanalabs/common/habanalabs_drv.c   |   1 +
 drivers/misc/habanalabs/gaudi/gaudi.c         |   8 +-
 drivers/misc/habanalabs/goya/goya.c           |   6 +
 5 files changed, 111 insertions(+), 36 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index e1949b087ae3..2022e5d7b3ad 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -546,6 +546,19 @@ static void hl_device_heartbeat(struct work_struct *work)
 	return;
 
 reschedule:
+	/*
+	 * prev_reset_trigger tracks consecutive fatal h/w errors until first
+	 * heartbeat immediately post reset.
+	 * If control reached here, then at least one heartbeat work has been
+	 * scheduled since last reset/init cycle.
+	 * So if the device is not already in reset cycle, reset the flag
+	 * prev_reset_trigger as no reset occurred with HL_RESET_FW_FATAL_ERR
+	 * status for at least one heartbeat. From this point driver restarts
+	 * tracking future consecutive fatal errors.
+	 */
+	if (!(atomic_read(&hdev->in_reset)))
+		hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+
 	schedule_delayed_work(&hdev->work_heartbeat,
 			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
 }
@@ -925,6 +938,65 @@ static void device_disable_open_processes(struct hl_device *hdev)
 	mutex_unlock(&hdev->fpriv_list_lock);
 }
 
+static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
+{
+	u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+
+	/*
+	 * 'reset cause' is being updated here, because getting here
+	 * means that it's the 1st time and the last time we're here
+	 * ('in_reset' makes sure of it). This makes sure that
+	 * 'reset_cause' will continue holding its 1st recorded reason!
+	 */
+	if (flags & HL_RESET_HEARTBEAT) {
+		hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
+		cur_reset_trigger = HL_RESET_HEARTBEAT;
+	} else if (flags & HL_RESET_TDR) {
+		hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
+		cur_reset_trigger = HL_RESET_TDR;
+	} else if (flags & HL_RESET_FW_FATAL_ERR) {
+		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+		cur_reset_trigger = HL_RESET_FW_FATAL_ERR;
+	} else {
+		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+	}
+
+	/*
+	 * If reset cause is same twice, then reset_trigger_repeated
+	 * is set and if this reset is due to a fatal FW error
+	 * device is set to an unstable state.
+	 */
+	if (hdev->prev_reset_trigger != cur_reset_trigger) {
+		hdev->prev_reset_trigger = cur_reset_trigger;
+		hdev->reset_trigger_repeated = 0;
+	} else {
+		hdev->reset_trigger_repeated = 1;
+	}
+
+	/* If reset is due to heartbeat, device CPU is no responsive in
+	 * which case no point sending PCI disable message to it.
+	 *
+	 * If F/W is performing the reset, no need to send it a message to disable
+	 * PCI access
+	 */
+	if ((flags & HL_RESET_HARD) &&
+			!(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
+		/* Disable PCI access from device F/W so he won't send
+		 * us additional interrupts. We disable MSI/MSI-X at
+		 * the halt_engines function and we can't have the F/W
+		 * sending us interrupts after that. We need to disable
+		 * the access here because if the device is marked
+		 * disable, the message won't be send. Also, in case
+		 * of heartbeat, the device CPU is marked as disable
+		 * so this message won't be sent
+		 */
+		if (hl_fw_send_pci_access_msg(hdev,
+				CPUCP_PACKET_DISABLE_PCI_ACCESS))
+			dev_warn(hdev->dev,
+				"Failed to disable PCI access by F/W\n");
+	}
+}
+
 /*
  * hl_device_reset - reset the device
  *
@@ -994,40 +1066,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		if (rc)
 			return 0;
 
-		/*
-		 * 'reset cause' is being updated here, because getting here
-		 * means that it's the 1st time and the last time we're here
-		 * ('in_reset' makes sure of it). This makes sure that
-		 * 'reset_cause' will continue holding its 1st recorded reason!
-		 */
-		if (flags & HL_RESET_HEARTBEAT)
-			hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
-		else if (flags & HL_RESET_TDR)
-			hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
-		else
-			hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
-
-		/* If reset is due to heartbeat, device CPU is no responsive in
-		 * which case no point sending PCI disable message to it.
-		 *
-		 * If F/W is performing the reset, no need to send it a message to disable
-		 * PCI access
-		 */
-		if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
-			/* Disable PCI access from device F/W so he won't send
-			 * us additional interrupts. We disable MSI/MSI-X at
-			 * the halt_engines function and we can't have the F/W
-			 * sending us interrupts after that. We need to disable
-			 * the access here because if the device is marked
-			 * disable, the message won't be send. Also, in case
-			 * of heartbeat, the device CPU is marked as disable
-			 * so this message won't be sent
-			 */
-			if (hl_fw_send_pci_access_msg(hdev,
-					CPUCP_PACKET_DISABLE_PCI_ACCESS))
-				dev_warn(hdev->dev,
-					"Failed to disable PCI access by F/W\n");
-		}
+		handle_reset_trigger(hdev, flags);
 
 		/* This also blocks future CS/VM/JOB completion operations */
 		hdev->disabled = true;
@@ -1131,6 +1170,17 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		hdev->device_cpu_disabled = false;
 		hdev->hard_reset_pending = false;
 
+		if (hdev->reset_trigger_repeated &&
+				(hdev->prev_reset_trigger == HL_RESET_FW_FATAL_ERR)) {
+			/* if there 2 back to back resets from FW,
+			 * ensure driver puts the driver in a unusable state
+			 */
+			dev_crit(hdev->dev,
+				"Consecutive FW fatal errors received, stopping hard reset\n");
+			rc = -EIO;
+			goto out_err;
+		}
+
 		if (hdev->kernel_ctx) {
 			dev_crit(hdev->dev,
 				"kernel ctx was alive during hard reset, something is terribly wrong\n");
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 2d9edd734d1c..a06135155b57 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -68,6 +68,9 @@
 
 #define HL_STATE_DUMP_HIST_LEN		5
 
+/* Default value for device reset trigger , an invalid value */
+#define HL_RESET_TRIGGER_DEFAULT	0xFF
+
 #define OBJ_NAMES_HASH_TABLE_BITS	7 /* 1 << 7 buckets */
 #define SYNC_TO_ENGINE_HASH_TABLE_BITS	7 /* 1 << 7 buckets */
 
@@ -132,13 +135,18 @@ enum hl_mmu_page_table_location {
  * - HL_RESET_FW
  *       F/W will perform the reset. No need to ask it to reset the device. This is relevant
  *       only when running with secured f/w
+ *
+ * - HL_RESET_FW_FATAL_ERR
+ *       Set if reset is due to a fatal error from FW
  */
+
 #define HL_RESET_HARD			(1 << 0)
 #define HL_RESET_FROM_RESET_THREAD	(1 << 1)
 #define HL_RESET_HEARTBEAT		(1 << 2)
 #define HL_RESET_TDR			(1 << 3)
 #define HL_RESET_DEVICE_RELEASE		(1 << 4)
 #define HL_RESET_FW			(1 << 5)
+#define HL_RESET_FW_FATAL_ERR		(1 << 6)
 
 #define HL_MAX_SOBS_PER_MONITOR	8
 
@@ -2458,6 +2466,10 @@ struct multi_cs_data {
  * @supports_staged_submission: true if staged submissions are supported
  * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
  *                    triggered, and cleared after it is shared with preboot.
+ * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden
+ *                      with a new value on next reset
+ * @reset_trigger_repeated: set if device reset is triggered more than once with
+ *                          same cause.
  * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
  *                         complete instead.
  * @device_cpu_is_halted: Flag to indicate whether the device CPU was already
@@ -2585,6 +2597,8 @@ struct hl_device {
 	u8				device_fini_pending;
 	u8				supports_staged_submission;
 	u8				curr_reset_cause;
+	u8				prev_reset_trigger;
+	u8				reset_trigger_repeated;
 	u8				skip_reset_on_timeout;
 	u8				device_cpu_is_halted;
 	u8				supports_wait_for_multi_cs;
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index a75e4fceb9d8..1da56069750a 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -339,6 +339,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	set_driver_behavior_per_device(hdev);
 
 	hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+	hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
 
 	if (timeout_locked)
 		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 14da87b38e83..70a668951ec4 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7932,6 +7932,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
 	u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
+	u32 fw_fatal_err_flag = 0;
 	u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
 			>> EQ_CTL_EVENT_TYPE_SHIFT);
 	bool reset_required;
@@ -7972,6 +7973,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
 		gaudi_print_irq_info(hdev, event_type, true);
 		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_GIC500:
@@ -7979,6 +7981,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_L2_RAM_ECC:
 	case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
 		gaudi_print_irq_info(hdev, event_type, false);
+		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_HBM0_SPI_0:
@@ -7989,6 +7992,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		gaudi_hbm_read_interrupts(hdev,
 				gaudi_hbm_event_to_dev(event_type),
 				&eq_entry->hbm_ecc_data);
+		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_HBM0_SPI_1:
@@ -8171,9 +8175,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 
 reset_device:
 	if (hdev->asic_prop.fw_security_enabled)
-		hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);
+		hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW | fw_fatal_err_flag);
 	else if (hdev->hard_reset_on_fw_events)
-		hl_device_reset(hdev, HL_RESET_HARD);
+		hl_device_reset(hdev, HL_RESET_HARD | fw_fatal_err_flag);
 	else
 		hl_fw_unmask_irq(hdev, event_type);
 }
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index ef67e2586ede..78cf35879680 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4829,6 +4829,12 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 	case GOYA_ASYNC_EVENT_ID_PLL0 ... GOYA_ASYNC_EVENT_ID_PLL6:
 	case GOYA_ASYNC_EVENT_ID_AXI_ECC:
 	case GOYA_ASYNC_EVENT_ID_L2_RAM_ECC:
+		goya_print_irq_info(hdev, event_type, false);
+		if (hdev->hard_reset_on_fw_events)
+			hl_device_reset(hdev, (HL_RESET_HARD |
+						HL_RESET_FW_FATAL_ERR));
+		break;
+
 	case GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET:
 		goya_print_irq_info(hdev, event_type, false);
 		if (hdev->hard_reset_on_fw_events)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/5] habanalabs: update firmware files
  2021-10-10  8:03 [PATCH 1/5] habanalabs: bypass reset for continuous h/w error event Oded Gabbay
@ 2021-10-10  8:03 ` Oded Gabbay
  2021-10-10  8:03 ` [PATCH 3/5] habanalabs: use only u32 Oded Gabbay
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-10-10  8:03 UTC (permalink / raw)
  To: linux-kernel

Update the firmware headers to the latest version

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../misc/habanalabs/include/common/cpucp_if.h |  12 +-
 .../habanalabs/include/common/hl_boot_if.h    | 185 +++++++++++-------
 .../habanalabs/include/gaudi/gaudi_fw_if.h    |  10 +-
 .../habanalabs/include/gaudi/gaudi_reg_map.h  |   1 +
 4 files changed, 130 insertions(+), 78 deletions(-)

diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index a6fa1cfa38a5..ae13231fda94 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -542,11 +542,14 @@ enum cpucp_packet_rc {
  */
 enum cpucp_temp_type {
 	cpucp_temp_input,
+	cpucp_temp_min = 4,
+	cpucp_temp_min_hyst,
 	cpucp_temp_max = 6,
 	cpucp_temp_max_hyst,
 	cpucp_temp_crit,
 	cpucp_temp_crit_hyst,
 	cpucp_temp_offset = 19,
+	cpucp_temp_lowest = 21,
 	cpucp_temp_highest = 22,
 	cpucp_temp_reset_history = 23
 };
@@ -555,6 +558,7 @@ enum cpucp_in_attributes {
 	cpucp_in_input,
 	cpucp_in_min,
 	cpucp_in_max,
+	cpucp_in_lowest = 6,
 	cpucp_in_highest = 7,
 	cpucp_in_reset_history
 };
@@ -563,6 +567,7 @@ enum cpucp_curr_attributes {
 	cpucp_curr_input,
 	cpucp_curr_min,
 	cpucp_curr_max,
+	cpucp_curr_lowest = 6,
 	cpucp_curr_highest = 7,
 	cpucp_curr_reset_history
 };
@@ -741,6 +746,9 @@ struct cpucp_security_info {
  * @pll_map: Bit map of supported PLLs for current ASIC version.
  * @mme_binning_mask: MME binning mask,
  *                   (0 = functional, 1 = binned)
+ * @dram_binning_mask: DRAM binning mask, 1 bit per dram instance
+ *                     (0 = functional 1 = binned)
+ * @memory_repair_flag: eFuse flag indicating memory repair
  */
 struct cpucp_info {
 	struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
@@ -759,7 +767,9 @@ struct cpucp_info {
 	__le64 reserved3;
 	__le64 reserved4;
 	__u8 reserved5;
-	__u8 pad[7];
+	__u8 dram_binning_mask;
+	__u8 memory_repair_flag;
+	__u8 pad[5];
 	struct cpucp_security_info sec_info;
 	__le32 reserved6;
 	__u8 pll_map[PLL_MAP_LEN];
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 8837925b5d85..2626df6ef3ef 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -15,6 +15,28 @@
 
 #define VERSION_MAX_LEN			128
 
+enum cpu_boot_err {
+	CPU_BOOT_ERR_DRAM_INIT_FAIL = 0,
+	CPU_BOOT_ERR_FIT_CORRUPTED = 1,
+	CPU_BOOT_ERR_TS_INIT_FAIL = 2,
+	CPU_BOOT_ERR_DRAM_SKIPPED = 3,
+	CPU_BOOT_ERR_BMC_WAIT_SKIPPED = 4,
+	CPU_BOOT_ERR_NIC_DATA_NOT_RDY = 5,
+	CPU_BOOT_ERR_NIC_FW_FAIL = 6,
+	CPU_BOOT_ERR_SECURITY_NOT_RDY = 7,
+	CPU_BOOT_ERR_SECURITY_FAIL = 8,
+	CPU_BOOT_ERR_EFUSE_FAIL = 9,
+	CPU_BOOT_ERR_PRI_IMG_VER_FAIL = 10,
+	CPU_BOOT_ERR_SEC_IMG_VER_FAIL = 11,
+	CPU_BOOT_ERR_PLL_FAIL = 12,
+	CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL = 13,
+	CPU_BOOT_ERR_BOOT_FW_CRIT_ERR = 18,
+	CPU_BOOT_ERR_BINNING_FAIL = 19,
+	CPU_BOOT_ERR_ENABLED = 31,
+	CPU_BOOT_ERR_SCND_EN = 63,
+	CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */
+};
+
 /*
  * CPU error bits in BOOT_ERROR registers
  *
@@ -78,25 +100,13 @@
  * CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	Device is unusable and customer support
  *					should be contacted.
  *
- * CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD	HALT ACK from ARC0 is not received
- *					within specified retries after issuing
- *					HALT request. ARC0 appears to be in bad
- *					reset.
- *
- * CPU_BOOT_ERR0_ARC1_HALT_ACK_NOT_RCVD	HALT ACK from ARC1 is not received
- *					within specified retries after issuing
- *					HALT request. ARC1 appears to be in bad
- *					reset.
- *
- * CPU_BOOT_ERR0_ARC0_RUN_ACK_NOT_RCVD	RUN ACK from ARC0 is not received
- *					within specified timeout after issuing
- *					RUN request. ARC0 appears to be in bad
- *					reset.
+ * CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR	Critical error was detected during
+ *					the execution of ppboot or preboot.
+ *					for example: stack overflow.
  *
- * CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD	RUN ACK from ARC1 is not received
- *					within specified timeout after issuing
- *					RUN request. ARC1 appears to be in bad
- *					reset.
+ * CPU_BOOT_ERR0_BINNING_FAIL		Binning settings failed, meaning
+ *					malfunctioning components might still be
+ *					in use.
  *
  * CPU_BOOT_ERR0_ENABLED		Error registers enabled.
  *					This is a main indication that the
@@ -104,26 +114,57 @@
  *					registers. Meaning the error bits are
  *					not garbage, but actual error statuses.
  */
-#define CPU_BOOT_ERR0_DRAM_INIT_FAIL		(1 << 0)
-#define CPU_BOOT_ERR0_FIT_CORRUPTED		(1 << 1)
-#define CPU_BOOT_ERR0_TS_INIT_FAIL		(1 << 2)
-#define CPU_BOOT_ERR0_DRAM_SKIPPED		(1 << 3)
-#define CPU_BOOT_ERR0_BMC_WAIT_SKIPPED		(1 << 4)
-#define CPU_BOOT_ERR0_NIC_DATA_NOT_RDY		(1 << 5)
-#define CPU_BOOT_ERR0_NIC_FW_FAIL		(1 << 6)
-#define CPU_BOOT_ERR0_SECURITY_NOT_RDY		(1 << 7)
-#define CPU_BOOT_ERR0_SECURITY_FAIL		(1 << 8)
-#define CPU_BOOT_ERR0_EFUSE_FAIL		(1 << 9)
-#define CPU_BOOT_ERR0_PRI_IMG_VER_FAIL		(1 << 10)
-#define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL		(1 << 11)
-#define CPU_BOOT_ERR0_PLL_FAIL			(1 << 12)
-#define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	(1 << 13)
-#define CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD	(1 << 14)
-#define CPU_BOOT_ERR0_ARC1_HALT_ACK_NOT_RCVD	(1 << 15)
-#define CPU_BOOT_ERR0_ARC0_RUN_ACK_NOT_RCVD	(1 << 16)
-#define CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD	(1 << 17)
-#define CPU_BOOT_ERR0_ENABLED			(1 << 31)
-#define CPU_BOOT_ERR1_ENABLED			(1 << 31)
+#define CPU_BOOT_ERR0_DRAM_INIT_FAIL		(1 << CPU_BOOT_ERR_DRAM_INIT_FAIL)
+#define CPU_BOOT_ERR0_FIT_CORRUPTED		(1 << CPU_BOOT_ERR_FIT_CORRUPTED)
+#define CPU_BOOT_ERR0_TS_INIT_FAIL		(1 << CPU_BOOT_ERR_TS_INIT_FAIL)
+#define CPU_BOOT_ERR0_DRAM_SKIPPED		(1 << CPU_BOOT_ERR_DRAM_SKIPPED)
+#define CPU_BOOT_ERR0_BMC_WAIT_SKIPPED		(1 << CPU_BOOT_ERR_BMC_WAIT_SKIPPED)
+#define CPU_BOOT_ERR0_NIC_DATA_NOT_RDY		(1 << CPU_BOOT_ERR_NIC_DATA_NOT_RDY)
+#define CPU_BOOT_ERR0_NIC_FW_FAIL		(1 << CPU_BOOT_ERR_NIC_FW_FAIL)
+#define CPU_BOOT_ERR0_SECURITY_NOT_RDY		(1 << CPU_BOOT_ERR_SECURITY_NOT_RDY)
+#define CPU_BOOT_ERR0_SECURITY_FAIL		(1 << CPU_BOOT_ERR_SECURITY_FAIL)
+#define CPU_BOOT_ERR0_EFUSE_FAIL		(1 << CPU_BOOT_ERR_EFUSE_FAIL)
+#define CPU_BOOT_ERR0_PRI_IMG_VER_FAIL		(1 << CPU_BOOT_ERR_PRI_IMG_VER_FAIL)
+#define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL		(1 << CPU_BOOT_ERR_SEC_IMG_VER_FAIL)
+#define CPU_BOOT_ERR0_PLL_FAIL			(1 << CPU_BOOT_ERR_PLL_FAIL)
+#define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	(1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL)
+#define CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR		(1 << CPU_BOOT_ERR_BOOT_FW_CRIT_ERR)
+#define CPU_BOOT_ERR0_BINNING_FAIL		(1 << CPU_BOOT_ERR_BINNING_FAIL)
+#define CPU_BOOT_ERR0_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
+#define CPU_BOOT_ERR1_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
+
+enum cpu_boot_dev_sts {
+	CPU_BOOT_DEV_STS_SECURITY_EN = 0,
+	CPU_BOOT_DEV_STS_DEBUG_EN = 1,
+	CPU_BOOT_DEV_STS_WATCHDOG_EN = 2,
+	CPU_BOOT_DEV_STS_DRAM_INIT_EN = 3,
+	CPU_BOOT_DEV_STS_BMC_WAIT_EN = 4,
+	CPU_BOOT_DEV_STS_E2E_CRED_EN = 5,
+	CPU_BOOT_DEV_STS_HBM_CRED_EN = 6,
+	CPU_BOOT_DEV_STS_RL_EN = 7,
+	CPU_BOOT_DEV_STS_SRAM_SCR_EN = 8,
+	CPU_BOOT_DEV_STS_DRAM_SCR_EN = 9,
+	CPU_BOOT_DEV_STS_FW_HARD_RST_EN = 10,
+	CPU_BOOT_DEV_STS_PLL_INFO_EN = 11,
+	CPU_BOOT_DEV_STS_SP_SRAM_EN = 12,
+	CPU_BOOT_DEV_STS_CLK_GATE_EN = 13,
+	CPU_BOOT_DEV_STS_HBM_ECC_EN = 14,
+	CPU_BOOT_DEV_STS_PKT_PI_ACK_EN = 15,
+	CPU_BOOT_DEV_STS_FW_LD_COM_EN = 16,
+	CPU_BOOT_DEV_STS_FW_IATU_CONF_EN = 17,
+	CPU_BOOT_DEV_STS_FW_NIC_MAC_EN = 18,
+	CPU_BOOT_DEV_STS_DYN_PLL_EN = 19,
+	CPU_BOOT_DEV_STS_GIC_PRIVILEGED_EN = 20,
+	CPU_BOOT_DEV_STS_EQ_INDEX_EN = 21,
+	CPU_BOOT_DEV_STS_MULTI_IRQ_POLL_EN = 22,
+	CPU_BOOT_DEV_STS_FW_NIC_STAT_XPCS91_EN = 23,
+	CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN = 24,
+	CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN = 25,
+	CPU_BOOT_DEV_STS_MAP_HWMON_EN = 26,
+	CPU_BOOT_DEV_STS_ENABLED = 31,
+	CPU_BOOT_DEV_STS_SCND_EN = 63,
+	CPU_BOOT_DEV_STS_LAST = 64 /* we have 2 registers of 32 bits */
+};
 
 /*
  * BOOT DEVICE STATUS bits in BOOT_DEVICE_STS registers
@@ -233,7 +274,7 @@
  *					was not served before.
  *					Initialized in: linux
  *
- * CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN  Use multiple scratchpad interfaces to
+ * CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN	Use multiple scratchpad interfaces to
  *					prevent IRQs overriding each other.
  *					Initialized in: linux
  *
@@ -266,35 +307,35 @@
  *					Initialized in: preboot
  *
  */
-#define CPU_BOOT_DEV_STS0_SECURITY_EN			(1 << 0)
-#define CPU_BOOT_DEV_STS0_DEBUG_EN			(1 << 1)
-#define CPU_BOOT_DEV_STS0_WATCHDOG_EN			(1 << 2)
-#define CPU_BOOT_DEV_STS0_DRAM_INIT_EN			(1 << 3)
-#define CPU_BOOT_DEV_STS0_BMC_WAIT_EN			(1 << 4)
-#define CPU_BOOT_DEV_STS0_E2E_CRED_EN			(1 << 5)
-#define CPU_BOOT_DEV_STS0_HBM_CRED_EN			(1 << 6)
-#define CPU_BOOT_DEV_STS0_RL_EN				(1 << 7)
-#define CPU_BOOT_DEV_STS0_SRAM_SCR_EN			(1 << 8)
-#define CPU_BOOT_DEV_STS0_DRAM_SCR_EN			(1 << 9)
-#define CPU_BOOT_DEV_STS0_FW_HARD_RST_EN		(1 << 10)
-#define CPU_BOOT_DEV_STS0_PLL_INFO_EN			(1 << 11)
-#define CPU_BOOT_DEV_STS0_SP_SRAM_EN			(1 << 12)
-#define CPU_BOOT_DEV_STS0_CLK_GATE_EN			(1 << 13)
-#define CPU_BOOT_DEV_STS0_HBM_ECC_EN			(1 << 14)
-#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN			(1 << 15)
-#define CPU_BOOT_DEV_STS0_FW_LD_COM_EN			(1 << 16)
-#define CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN		(1 << 17)
-#define CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN			(1 << 18)
-#define CPU_BOOT_DEV_STS0_DYN_PLL_EN			(1 << 19)
-#define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN		(1 << 20)
-#define CPU_BOOT_DEV_STS0_EQ_INDEX_EN			(1 << 21)
-#define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN		(1 << 22)
-#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN		(1 << 23)
-#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN		(1 << 24)
-#define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN		(1 << 25)
-#define CPU_BOOT_DEV_STS0_MAP_HWMON_EN			(1 << 26)
-#define CPU_BOOT_DEV_STS0_ENABLED			(1 << 31)
-#define CPU_BOOT_DEV_STS1_ENABLED			(1 << 31)
+#define CPU_BOOT_DEV_STS0_SECURITY_EN		(1 << CPU_BOOT_DEV_STS_SECURITY_EN)
+#define CPU_BOOT_DEV_STS0_DEBUG_EN		(1 << CPU_BOOT_DEV_STS_DEBUG_EN)
+#define CPU_BOOT_DEV_STS0_WATCHDOG_EN		(1 << CPU_BOOT_DEV_STS_WATCHDOG_EN)
+#define CPU_BOOT_DEV_STS0_DRAM_INIT_EN		(1 << CPU_BOOT_DEV_STS_DRAM_INIT_EN)
+#define CPU_BOOT_DEV_STS0_BMC_WAIT_EN		(1 << CPU_BOOT_DEV_STS_BMC_WAIT_EN)
+#define CPU_BOOT_DEV_STS0_E2E_CRED_EN		(1 << CPU_BOOT_DEV_STS_E2E_CRED_EN)
+#define CPU_BOOT_DEV_STS0_HBM_CRED_EN		(1 << CPU_BOOT_DEV_STS_HBM_CRED_EN)
+#define CPU_BOOT_DEV_STS0_RL_EN			(1 << CPU_BOOT_DEV_STS_RL_EN)
+#define CPU_BOOT_DEV_STS0_SRAM_SCR_EN		(1 << CPU_BOOT_DEV_STS_SRAM_SCR_EN)
+#define CPU_BOOT_DEV_STS0_DRAM_SCR_EN		(1 << CPU_BOOT_DEV_STS_DRAM_SCR_EN)
+#define CPU_BOOT_DEV_STS0_FW_HARD_RST_EN	(1 << CPU_BOOT_DEV_STS_FW_HARD_RST_EN)
+#define CPU_BOOT_DEV_STS0_PLL_INFO_EN		(1 << CPU_BOOT_DEV_STS_PLL_INFO_EN)
+#define CPU_BOOT_DEV_STS0_SP_SRAM_EN		(1 << CPU_BOOT_DEV_STS_SP_SRAM_EN)
+#define CPU_BOOT_DEV_STS0_CLK_GATE_EN		(1 << CPU_BOOT_DEV_STS_CLK_GATE_EN)
+#define CPU_BOOT_DEV_STS0_HBM_ECC_EN		(1 << CPU_BOOT_DEV_STS_HBM_ECC_EN)
+#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN		(1 << CPU_BOOT_DEV_STS_PKT_PI_ACK_EN)
+#define CPU_BOOT_DEV_STS0_FW_LD_COM_EN		(1 << CPU_BOOT_DEV_STS_FW_LD_COM_EN)
+#define CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN	(1 << CPU_BOOT_DEV_STS_FW_IATU_CONF_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN		(1 << CPU_BOOT_DEV_STS_FW_NIC_MAC_EN)
+#define CPU_BOOT_DEV_STS0_DYN_PLL_EN		(1 << CPU_BOOT_DEV_STS_DYN_PLL_EN)
+#define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN	(1 << CPU_BOOT_DEV_STS_GIC_PRIVILEGED_EN)
+#define CPU_BOOT_DEV_STS0_EQ_INDEX_EN		(1 << CPU_BOOT_DEV_STS_EQ_INDEX_EN)
+#define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN	(1 << CPU_BOOT_DEV_STS_MULTI_IRQ_POLL_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN	(1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_XPCS91_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN	(1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN)
+#define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN	(1 << CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN)
+#define CPU_BOOT_DEV_STS0_MAP_HWMON_EN		(1 << CPU_BOOT_DEV_STS_MAP_HWMON_EN)
+#define CPU_BOOT_DEV_STS0_ENABLED		(1 << CPU_BOOT_DEV_STS_ENABLED)
+#define CPU_BOOT_DEV_STS1_ENABLED		(1 << CPU_BOOT_DEV_STS_ENABLED)
 
 enum cpu_boot_status {
 	CPU_BOOT_STATUS_NA = 0,		/* Default value after reset of chip */
@@ -411,6 +452,8 @@ struct cpu_dyn_regs {
 enum comms_msg_type {
 	HL_COMMS_DESC_TYPE = 0,
 	HL_COMMS_RESET_CAUSE_TYPE = 1,
+	HL_COMMS_FW_CFG_SKIP_TYPE = 2,
+	HL_COMMS_BINNING_CONF_TYPE = 3,
 };
 
 /* TODO: remove this struct after the code is updated to use message */
@@ -470,6 +513,9 @@ struct lkd_fw_comms_msg {
 		struct {
 			__u8 reset_cause;
 		};
+		struct {
+			__u8 fw_cfg_skip; /* 1 - skip, 0 - don't skip */
+		};
 	};
 };
 
@@ -513,8 +559,6 @@ struct lkd_fw_comms_msg {
  * COMMS_SKIP_BMC		Perform actions required for BMC-less servers.
  *				Do not wait for BMC response.
  *
- * COMMS_LOW_PLL_OPP		Initialize PLLs for low OPP.
- *
  * COMMS_PREP_DESC_ELBI		Same as COMMS_PREP_DESC only that the memory
  *				space is allocated in a ELBI access only
  *				address range.
@@ -530,7 +574,6 @@ enum comms_cmd {
 	COMMS_RST_DEV = 6,
 	COMMS_GOTO_WFE = 7,
 	COMMS_SKIP_BMC = 8,
-	COMMS_LOW_PLL_OPP = 9,
 	COMMS_PREP_DESC_ELBI = 10,
 	COMMS_INVLD_LAST
 };
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h b/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
index 34ca4fe50d91..2dba02757d37 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
@@ -8,8 +8,6 @@
 #ifndef GAUDI_FW_IF_H
 #define GAUDI_FW_IF_H
 
-#include <linux/types.h>
-
 #define GAUDI_EVENT_QUEUE_MSI_IDX	8
 #define GAUDI_NIC_PORT1_MSI_IDX		10
 #define GAUDI_NIC_PORT3_MSI_IDX		12
@@ -78,13 +76,13 @@ struct gaudi_nic_status {
 	__u32 high_ber_cnt;
 };
 
-struct gaudi_flops_2_data {
+struct gaudi_cold_rst_data {
 	union {
 		struct {
-			__u32 spsram_init_done : 1;
-			__u32 reserved : 31;
+			u32 spsram_init_done : 1;
+			u32 reserved : 31;
 		};
-		__u32 data;
+		__le32 data;
 	};
 };
 
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h b/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
index b9bd5a7f71eb..92f25c2ae083 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
@@ -33,6 +33,7 @@
 #define mmRDWR_TEST			mmPSOC_GLOBAL_CONF_SCRATCHPAD_30
 #define mmBTL_ID			mmPSOC_GLOBAL_CONF_SCRATCHPAD_31
 #define mmPREBOOT_PCIE_EN		mmPSOC_GLOBAL_CONF_COLD_RST_FLOPS_1
+#define mmCOLD_RST_DATA			mmPSOC_GLOBAL_CONF_COLD_RST_FLOPS_2
 #define mmUPD_PENDING_STS		mmPSOC_GLOBAL_CONF_COLD_RST_FLOPS_3
 
 #endif /* GAUDI_REG_MAP_H_ */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/5] habanalabs: use only u32
  2021-10-10  8:03 [PATCH 1/5] habanalabs: bypass reset for continuous h/w error event Oded Gabbay
  2021-10-10  8:03 ` [PATCH 2/5] habanalabs: update firmware files Oded Gabbay
@ 2021-10-10  8:03 ` Oded Gabbay
  2021-10-10  8:03 ` [PATCH 4/5] habanalabs: fix race condition in multi CS completion Oded Gabbay
  2021-10-10  8:03 ` [PATCH 5/5] habanalabs: fix NULL pointer dereference Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-10-10  8:03 UTC (permalink / raw)
  To: linux-kernel

In the kernel it is common to use u32 and not uint32_t.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 70a668951ec4..654f7959c5ad 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -2616,7 +2616,7 @@ static void gaudi_init_e2e(struct hl_device *hdev)
 
 static void gaudi_init_hbm_cred(struct hl_device *hdev)
 {
-	uint32_t hbm0_wr, hbm1_wr, hbm0_rd, hbm1_rd;
+	u32 hbm0_wr, hbm1_wr, hbm0_rd, hbm1_rd;
 
 	if (hdev->asic_prop.fw_security_enabled)
 		return;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/5] habanalabs: fix race condition in multi CS completion
  2021-10-10  8:03 [PATCH 1/5] habanalabs: bypass reset for continuous h/w error event Oded Gabbay
  2021-10-10  8:03 ` [PATCH 2/5] habanalabs: update firmware files Oded Gabbay
  2021-10-10  8:03 ` [PATCH 3/5] habanalabs: use only u32 Oded Gabbay
@ 2021-10-10  8:03 ` Oded Gabbay
  2021-10-10  8:03 ` [PATCH 5/5] habanalabs: fix NULL pointer dereference Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-10-10  8:03 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Race condition occurs when CS fence completes and multi CS did not
completed yet, while waiting for multi CS ends and returns indication
to user that the CS completed. Next wait for multi CS may be triggered
by previous multi CS completion without any current CS completed,
causing an error.

Example scenario :
1. User do multi CS wait for CSs 1 and 2 on master QID 0

2. CS 1 and 2 reached the "cs release" code. The thread of CS 1
   completed both the CS and multi CS handling but the completion
   thread of CS 2 completed the CS but still did not executed
   complete_multi_cs (note that in CS completion the sequence is to
   first do complete all for the CS and then another complete all to
   signal the multi_cs)

3. User received indication that CS 1 and 2 completed (since we check
   the CS fence and both indicated as completed) and immediately waits
   on CS 3 and 4, also on master QID 0.

4. Completion thread of CS2 executed complete_multi_cs before
   completion of CS 3 and 4 and so will trigger the multi CS wait of
   CSs 3 and 4 as they wait on master QID 0.

This will trigger multi CS completion although none of its
current CS has been completed.

Fixed by adding multi CS complete handling indication for each CS.
CS will be marked to the user as completed only if its fence completed
and multi CS handling is done.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 20 ++++++++++++++++++-
 drivers/misc/habanalabs/common/habanalabs.h   |  4 ++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 4bc24852a283..8e4dc1441fff 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -143,6 +143,7 @@ static void hl_fence_init(struct hl_fence *fence, u64 sequence)
 	fence->cs_sequence = sequence;
 	fence->error = 0;
 	fence->timestamp = ktime_set(0, 0);
+	fence->mcs_handling_done = false;
 	init_completion(&fence->completion);
 }
 
@@ -535,10 +536,21 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
 				mcs_compl->timestamp =
 						ktime_to_ns(fence->timestamp);
 			complete_all(&mcs_compl->completion);
+
+			/*
+			 * Setting mcs_handling_done inside the lock ensures
+			 * at least one fence have mcs_handling_done set to
+			 * true before wait for mcs finish. This ensures at
+			 * least one CS will be set as completed when polling
+			 * mcs fences.
+			 */
+			fence->mcs_handling_done = true;
 		}
 
 		spin_unlock(&mcs_compl->lock);
 	}
+	/* In case CS completed without mcs completion initialized */
+	fence->mcs_handling_done = true;
 }
 
 static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
@@ -2372,7 +2384,13 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
 
 		mcs_data->stream_master_qid_map |= fence->stream_master_qid_map;
 
-		if (status == CS_WAIT_STATUS_BUSY)
+		/*
+		 * Using mcs_handling_done to avoid possibility of mcs_data
+		 * returns to user indicating CS completed before it finished
+		 * all of its mcs handling, to avoid race the next time the
+		 * user waits for mcs.
+		 */
+		if (status == CS_WAIT_STATUS_BUSY || !fence->mcs_handling_done)
 			continue;
 
 		mcs_data->completion_bitmap |= BIT(i);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index a06135155b57..48f0e52cd5be 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -610,6 +610,9 @@ struct asic_fixed_properties {
  * @error: mark this fence with error
  * @timestamp: timestamp upon completion
  * @take_timestamp: timestamp shall be taken upon completion
+ * @mcs_handling_done: indicates that corresponding command submission has
+ *                     finished msc handling, this does not mean it was part
+ *                     of the mcs
  */
 struct hl_fence {
 	struct completion	completion;
@@ -619,6 +622,7 @@ struct hl_fence {
 	int			error;
 	ktime_t			timestamp;
 	u8			take_timestamp;
+	u8			mcs_handling_done;
 };
 
 /**
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/5] habanalabs: fix NULL pointer dereference
  2021-10-10  8:03 [PATCH 1/5] habanalabs: bypass reset for continuous h/w error event Oded Gabbay
                   ` (2 preceding siblings ...)
  2021-10-10  8:03 ` [PATCH 4/5] habanalabs: fix race condition in multi CS completion Oded Gabbay
@ 2021-10-10  8:03 ` Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-10-10  8:03 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

When polling fences for multi CS, it is possible that fence is
no longer exists (its corresponding CS completed and the fence was
deleted) but we still accessing its parameters, causing NULL pointer
dereference.

Fixed by checking if fence exits before accessing its parameters.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 8e4dc1441fff..a3358cc3c877 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2382,7 +2382,15 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
 			break;
 		}
 
-		mcs_data->stream_master_qid_map |= fence->stream_master_qid_map;
+		/*
+		 * It is possible to get an old sequence numbers from user
+		 * which related to already completed CSs and their fences
+		 * already gone. In this case, no need to consider its QID for
+		 * mcs completion.
+		 */
+		if (fence)
+			mcs_data->stream_master_qid_map |=
+					fence->stream_master_qid_map;
 
 		/*
 		 * Using mcs_handling_done to avoid possibility of mcs_data
@@ -2390,7 +2398,8 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
 		 * all of its mcs handling, to avoid race the next time the
 		 * user waits for mcs.
 		 */
-		if (status == CS_WAIT_STATUS_BUSY || !fence->mcs_handling_done)
+		if (status == CS_WAIT_STATUS_BUSY ||
+				(fence && !fence->mcs_handling_done))
 			continue;
 
 		mcs_data->completion_bitmap |= BIT(i);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-10-10  8:03 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-10  8:03 [PATCH 1/5] habanalabs: bypass reset for continuous h/w error event Oded Gabbay
2021-10-10  8:03 ` [PATCH 2/5] habanalabs: update firmware files Oded Gabbay
2021-10-10  8:03 ` [PATCH 3/5] habanalabs: use only u32 Oded Gabbay
2021-10-10  8:03 ` [PATCH 4/5] habanalabs: fix race condition in multi CS completion Oded Gabbay
2021-10-10  8:03 ` [PATCH 5/5] habanalabs: fix NULL pointer dereference Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.