dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/7] accel/habanalabs: print raw binning masks in debug level
@ 2023-03-30  7:22 Oded Gabbay
  2023-03-30  7:22 ` [PATCH 2/7] accel/habanalabs: remove completion from abnormal interrupt work name Oded Gabbay
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-03-30  7:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

In rare cases, card initialization fails due to wrong values in the
efuse mappings that are parsed by the firmware.

To help debug those cases, print (in debug level) the raw binning masks
as fetched from the firmware during device initialization.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index ad491fb2c39d..ea9fdc616de4 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2888,6 +2888,10 @@ static int gaudi2_cpucp_info_get(struct hl_device *hdev)
 	hdev->tpc_binning = le64_to_cpu(prop->cpucp_info.tpc_binning_mask);
 	hdev->decoder_binning = lower_32_bits(le64_to_cpu(prop->cpucp_info.decoder_binning_mask));
 
+	dev_dbg(hdev->dev, "Read binning masks: tpc: 0x%llx, dram: 0x%llx, edma: 0x%x, dec: 0x%x\n",
+			hdev->tpc_binning, hdev->dram_binning, hdev->edma_binning,
+			hdev->decoder_binning);
+
 	/*
 	 * at this point the DRAM parameters need to be updated according to data obtained
 	 * from the FW
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/7] accel/habanalabs: remove completion from abnormal interrupt work name
  2023-03-30  7:22 [PATCH 1/7] accel/habanalabs: print raw binning masks in debug level Oded Gabbay
@ 2023-03-30  7:22 ` Oded Gabbay
  2023-03-30  7:22 ` [PATCH 3/7] accel/habanalabs: fix events mask of decoder abnormal interrupts Oded Gabbay
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-03-30  7:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Decoder abnormal interrupts are for errors and not for completion, so
rename the relevant work and work function to not include 'completion'.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/decoder.c    | 22 +++++++-------------
 drivers/accel/habanalabs/common/habanalabs.h | 10 ++++-----
 drivers/accel/habanalabs/common/irq.c        |  2 +-
 3 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/drivers/accel/habanalabs/common/decoder.c b/drivers/accel/habanalabs/common/decoder.c
index 69c78c1784b4..59a1ecb20c04 100644
--- a/drivers/accel/habanalabs/common/decoder.c
+++ b/drivers/accel/habanalabs/common/decoder.c
@@ -43,22 +43,24 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status)
 		intr_source[2], intr_source[3], intr_source[4], intr_source[5]);
 }
 
-static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id)
+static void dec_abnrm_intr_work(struct work_struct *work)
 {
+	struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work);
+	struct hl_device *hdev = dec->hdev;
 	bool reset_required = false;
 	u32 irq_status, event_mask;
 
-	irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
+	irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
 
-	dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, core_id);
+	dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, dec->core_id);
 
 	dec_print_abnrm_intr_source(hdev, irq_status);
 
 	/* Clear the interrupt */
-	WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
+	WREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
 
 	/* Flush the interrupt clear */
-	RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
+	RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
 
 	if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) {
 		reset_required = true;
@@ -77,14 +79,6 @@ static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_
 	}
 }
 
-static void dec_completion_abnrm(struct work_struct *work)
-{
-	struct hl_dec *dec = container_of(work, struct hl_dec, completion_abnrm_work);
-	struct hl_device *hdev = dec->hdev;
-
-	dec_error_intr_work(hdev, dec->base_addr, dec->core_id);
-}
-
 void hl_dec_fini(struct hl_device *hdev)
 {
 	kfree(hdev->dec);
@@ -108,7 +102,7 @@ int hl_dec_init(struct hl_device *hdev)
 		dec = hdev->dec + j;
 
 		dec->hdev = hdev;
-		INIT_WORK(&dec->completion_abnrm_work, dec_completion_abnrm);
+		INIT_WORK(&dec->abnrm_intr_work, dec_abnrm_intr_work);
 		dec->core_id = j;
 		dec->base_addr = hdev->asic_funcs->get_dec_base_addr(hdev, j);
 		if (!dec->base_addr) {
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index a6f5c2152b0a..7b6ad3d7dbaa 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -1211,15 +1211,15 @@ struct hl_eq {
 /**
  * struct hl_dec - describes a decoder sw instance.
  * @hdev: pointer to the device structure.
- * @completion_abnrm_work: workqueue object to run when decoder generates an error interrupt
+ * @abnrm_intr_work: workqueue work item to run when decoder generates an error interrupt.
  * @core_id: ID of the decoder.
  * @base_addr: base address of the decoder.
  */
 struct hl_dec {
-	struct hl_device		*hdev;
-	struct work_struct		completion_abnrm_work;
-	u32				core_id;
-	u32				base_addr;
+	struct hl_device	*hdev;
+	struct work_struct	abnrm_intr_work;
+	u32			core_id;
+	u32			base_addr;
 };
 
 /**
diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c
index 0d59bb7c9063..c67895b1cdeb 100644
--- a/drivers/accel/habanalabs/common/irq.c
+++ b/drivers/accel/habanalabs/common/irq.c
@@ -489,7 +489,7 @@ irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg)
 {
 	struct hl_dec *dec = arg;
 
-	schedule_work(&dec->completion_abnrm_work);
+	schedule_work(&dec->abnrm_intr_work);
 
 	return IRQ_HANDLED;
 }
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/7] accel/habanalabs: fix events mask of decoder abnormal interrupts
  2023-03-30  7:22 [PATCH 1/7] accel/habanalabs: print raw binning masks in debug level Oded Gabbay
  2023-03-30  7:22 ` [PATCH 2/7] accel/habanalabs: remove completion from abnormal interrupt work name Oded Gabbay
@ 2023-03-30  7:22 ` Oded Gabbay
  2023-03-30  7:22 ` [PATCH 4/7] accel/habanalabs: fix wrong reset and event flags Oded Gabbay
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-03-30  7:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

The decoder IRQ status register may have several bits set upon an
abnormal interrupt. Therefore, when setting the events mask, all bits
need to be checked independently instead of using an if-else chain.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/decoder.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/accel/habanalabs/common/decoder.c b/drivers/accel/habanalabs/common/decoder.c
index 59a1ecb20c04..c03a6da45d00 100644
--- a/drivers/accel/habanalabs/common/decoder.c
+++ b/drivers/accel/habanalabs/common/decoder.c
@@ -47,8 +47,8 @@ static void dec_abnrm_intr_work(struct work_struct *work)
 {
 	struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work);
 	struct hl_device *hdev = dec->hdev;
+	u32 irq_status, event_mask = 0;
 	bool reset_required = false;
-	u32 irq_status, event_mask;
 
 	irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
 
@@ -64,17 +64,21 @@ static void dec_abnrm_intr_work(struct work_struct *work)
 
 	if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) {
 		reset_required = true;
-		event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
-	} else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) {
-		event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
-	} else {
-		event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 	}
 
+	if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK)
+		event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+
+	if (irq_status & (VCMD_IRQ_STATUS_ENDCMD_MASK |
+				VCMD_IRQ_STATUS_BUSERR_MASK |
+				VCMD_IRQ_STATUS_ABORT_MASK))
+		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+
 	if (reset_required) {
 		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
 		hl_device_cond_reset(hdev, 0, event_mask);
-	} else {
+	} else if (event_mask) {
 		hl_notifier_event_send_all(hdev, event_mask);
 	}
 }
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/7] accel/habanalabs: fix wrong reset and event flags
  2023-03-30  7:22 [PATCH 1/7] accel/habanalabs: print raw binning masks in debug level Oded Gabbay
  2023-03-30  7:22 ` [PATCH 2/7] accel/habanalabs: remove completion from abnormal interrupt work name Oded Gabbay
  2023-03-30  7:22 ` [PATCH 3/7] accel/habanalabs: fix events mask of decoder abnormal interrupts Oded Gabbay
@ 2023-03-30  7:22 ` Oded Gabbay
  2023-03-30  7:22 ` [PATCH 5/7] accel/habanalabs: sync f/w events interrupt in hard reset Oded Gabbay
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-03-30  7:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

During event handling, the driver sets the relevant reset and user event
notifier flags. Fix a few wrong flag settings.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index ea9fdc616de4..ce85308d03e9 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -9510,19 +9510,18 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		break;
 
 	case GAUDI2_EVENT_ARC_AXI_ERROR_RESPONSE_0:
-		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		error_count = gaudi2_handle_arc_farm_sei_err(hdev, event_type);
-		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
 	case GAUDI2_EVENT_CPU_AXI_ERR_RSP:
 		error_count = gaudi2_handle_cpu_sei_err(hdev, event_type);
-		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+		event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
 		break;
 
 	case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
 	case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
-		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		error_count = gaudi2_handle_qm_sei_err(hdev, event_type, true, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
@@ -9709,12 +9708,14 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 
 	case GAUDI2_EVENT_PCIE_DRAIN_COMPLETE:
 		error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data);
+		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
 
 	case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN:
 		error_count = gaudi2_handle_psoc_drain(hdev,
 				le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
 
@@ -9743,6 +9744,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		break;
 	case GAUDI2_EVENT_PSOC_AXI_ERR_RSP:
 		error_count = GAUDI2_NA_EVENT_CAUSE;
+		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
 	case GAUDI2_EVENT_PSOC_PRSTN_FALL:
@@ -9756,6 +9758,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		break;
 	case GAUDI2_EVENT_PCIE_FATAL_ERR:
 		error_count = GAUDI2_NA_EVENT_CAUSE;
+		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
 	case GAUDI2_EVENT_TPC0_BMON_SPMU:
@@ -9823,6 +9826,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC:
 		gaudi2_print_out_of_sync_info(hdev, event_type, &eq_entry->pkt_sync_err);
 		error_count = GAUDI2_NA_EVENT_CAUSE;
+		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
 
@@ -9864,6 +9868,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_CPU_PKT_SANITY_FAILED:
 		gaudi2_print_cpu_pkt_failure_info(hdev, event_type, &eq_entry->pkt_sync_err);
 		error_count = GAUDI2_NA_EVENT_CAUSE;
+		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
 
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 5/7] accel/habanalabs: sync f/w events interrupt in hard reset
  2023-03-30  7:22 [PATCH 1/7] accel/habanalabs: print raw binning masks in debug level Oded Gabbay
                   ` (2 preceding siblings ...)
  2023-03-30  7:22 ` [PATCH 4/7] accel/habanalabs: fix wrong reset and event flags Oded Gabbay
@ 2023-03-30  7:22 ` Oded Gabbay
  2023-03-30  7:22 ` [PATCH 6/7] accel/habanalabs: don't wait for STS_OK after sending COMMS WFE Oded Gabbay
  2023-03-30  7:22 ` [PATCH 7/7] accel/habanalabs: fixes for unexpected error interrupt Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-03-30  7:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

Receiving events from FW while the device is in hard reset causes
a warning message in the driver log. The message may point to a
problem in the driver or FW, but it can also appear as a result
of events that were sent from FW just before the hard reset.
In order to avoid receiving events from FW while the device is in reset
and is already in 'disabled' mode, sync the f/w events interrupt right
before setting the device to 'disabled'.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c     | 55 +++++++++++---------
 drivers/accel/habanalabs/common/habanalabs.h |  2 +
 drivers/accel/habanalabs/gaudi/gaudi.c       |  3 ++
 drivers/accel/habanalabs/gaudi2/gaudi2.c     |  1 +
 drivers/accel/habanalabs/goya/goya.c         |  1 +
 5 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 3c1af9d43b65..fabfc501ef54 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1380,13 +1380,41 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d
 	mutex_unlock(fd_lock);
 }
 
+static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
+{
+	/* If reset is due to heartbeat, device CPU is no responsive in
+	 * which case no point sending PCI disable message to it.
+	 */
+	if ((flags & HL_DRV_RESET_HARD) &&
+			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
+		/* Disable PCI access from device F/W so he won't send
+		 * us additional interrupts. We disable MSI/MSI-X at
+		 * the halt_engines function and we can't have the F/W
+		 * sending us interrupts after that. We need to disable
+		 * the access here because if the device is marked
+		 * disable, the message won't be send. Also, in case
+		 * of heartbeat, the device CPU is marked as disable
+		 * so this message won't be sent
+		 */
+		if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
+			dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+			return;
+		}
+
+		/* verify that last EQs are handled before disabled is set */
+		if (hdev->cpu_queues_enable)
+			synchronize_irq(pci_irq_vector(hdev->pdev,
+					hdev->asic_prop.eq_interrupt_id));
+	}
+}
+
 static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 {
 	u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
 
 	/* No consecutive mechanism when user context exists */
 	if (hdev->is_compute_ctx_active)
-		goto disable_pci;
+		return;
 
 	/*
 	 * 'reset cause' is being updated here, because getting here
@@ -1418,30 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 	} else {
 		hdev->reset_info.reset_trigger_repeated = 1;
 	}
-
-	/* If reset is due to heartbeat, device CPU is no responsive in
-	 * which case no point sending PCI disable message to it.
-	 *
-	 * If F/W is performing the reset, no need to send it a message to disable
-	 * PCI access
-	 */
-
-disable_pci:
-	if ((flags & HL_DRV_RESET_HARD) &&
-			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
-		/* Disable PCI access from device F/W so he won't send
-		 * us additional interrupts. We disable MSI/MSI-X at
-		 * the halt_engines function and we can't have the F/W
-		 * sending us interrupts after that. We need to disable
-		 * the access here because if the device is marked
-		 * disable, the message won't be send. Also, in case
-		 * of heartbeat, the device CPU is marked as disable
-		 * so this message won't be sent
-		 */
-		if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
-			dev_warn(hdev->dev,
-				"Failed to disable FW's PCI access\n");
-	}
 }
 
 /*
@@ -1562,6 +1566,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 escalate_reset_flow:
 		handle_reset_trigger(hdev, flags);
+		send_disable_pci_access(hdev, flags);
 
 		/* This also blocks future CS/VM/JOB completion operations */
 		hdev->disabled = true;
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 7b6ad3d7dbaa..8c3bcc50e560 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -662,6 +662,7 @@ struct hl_hints_range {
  * @user_interrupt_count: number of user interrupts.
  * @user_dec_intr_count: number of decoder interrupts exposed to user.
  * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
+ * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset.
  * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error.
  * @cache_line_size: device cache line size.
  * @server_type: Server type that the ASIC is currently installed in.
@@ -793,6 +794,7 @@ struct asic_fixed_properties {
 	u16				user_interrupt_count;
 	u16				user_dec_intr_count;
 	u16				tpc_interrupt_id;
+	u16				eq_interrupt_id;
 	u16				unexpected_user_error_interrupt_id;
 	u16				cache_line_size;
 	u16				server_type;
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index 08a4b1cf2b42..2ad8e4efce7f 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -682,6 +682,9 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	prop->first_available_user_interrupt = USHRT_MAX;
 	prop->tpc_interrupt_id = USHRT_MAX;
 
+	/* single msi */
+	prop->eq_interrupt_id = 0;
+
 	for (i = 0 ; i < HL_MAX_DCORES ; i++)
 		prop->first_available_cq[i] = USHRT_MAX;
 
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index ce85308d03e9..554020026da8 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2439,6 +2439,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 
 	prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST;
 	prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT;
+	prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE;
 	prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR;
 
 	prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER;
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index 07d67878eac5..fb0ac9df841a 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -473,6 +473,7 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 
 	prop->first_available_user_interrupt = USHRT_MAX;
 	prop->tpc_interrupt_id = USHRT_MAX;
+	prop->eq_interrupt_id = GOYA_EVENT_QUEUE_MSIX_IDX;
 
 	for (i = 0 ; i < HL_MAX_DCORES ; i++)
 		prop->first_available_cq[i] = USHRT_MAX;
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 6/7] accel/habanalabs: don't wait for STS_OK after sending COMMS WFE
  2023-03-30  7:22 [PATCH 1/7] accel/habanalabs: print raw binning masks in debug level Oded Gabbay
                   ` (3 preceding siblings ...)
  2023-03-30  7:22 ` [PATCH 5/7] accel/habanalabs: sync f/w events interrupt in hard reset Oded Gabbay
@ 2023-03-30  7:22 ` Oded Gabbay
  2023-03-30  7:22 ` [PATCH 7/7] accel/habanalabs: fixes for unexpected error interrupt Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-03-30  7:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Sending COMMS_GOTO_WFE instructs the FW's CPU to halt (WFE state).
Once sent, FW's CPU isn't expected to continue communicating with LKD.
Therefore, the stage of waiting for COMMS_STS_OK should be skipped;
otherwise, waiting for COMMS_STS_OK will simply time out, which will
trigger unexpected behavior.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 781256dd49ad..59f61ec66445 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -1278,7 +1278,7 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
 	/* Stop device CPU to make sure nothing bad happens */
 	if (hdev->asic_prop.dynamic_fw_load) {
 		rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
-				COMMS_GOTO_WFE, 0, true,
+				COMMS_GOTO_WFE, 0, false,
 				hdev->fw_loader.cpu_timeout);
 		if (rc)
 			dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 7/7] accel/habanalabs: fixes for unexpected error interrupt
  2023-03-30  7:22 [PATCH 1/7] accel/habanalabs: print raw binning masks in debug level Oded Gabbay
                   ` (4 preceding siblings ...)
  2023-03-30  7:22 ` [PATCH 6/7] accel/habanalabs: don't wait for STS_OK after sending COMMS WFE Oded Gabbay
@ 2023-03-30  7:22 ` Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-03-30  7:22 UTC (permalink / raw)
  To: dri-devel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Remove the redundant asic prop variable, as there is no need to expose it
to common code. In addition, fix some typos.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/habanalabs.h | 2 --
 drivers/accel/habanalabs/gaudi2/gaudi2.c     | 5 ++---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 8c3bcc50e560..eaae69a9f817 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -663,7 +663,6 @@ struct hl_hints_range {
  * @user_dec_intr_count: number of decoder interrupts exposed to user.
  * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
  * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset.
- * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error.
  * @cache_line_size: device cache line size.
  * @server_type: Server type that the ASIC is currently installed in.
  *               The value is according to enum hl_server_type in uapi file.
@@ -795,7 +794,6 @@ struct asic_fixed_properties {
 	u16				user_dec_intr_count;
 	u16				tpc_interrupt_id;
 	u16				eq_interrupt_id;
-	u16				unexpected_user_error_interrupt_id;
 	u16				cache_line_size;
 	u16				server_type;
 	u8				completion_queues_count;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 554020026da8..da1b2e6dd683 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2440,7 +2440,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 	prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST;
 	prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT;
 	prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE;
-	prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR;
 
 	prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER;
 
@@ -3351,7 +3350,7 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev)
 	/* Initialize TPC interrupt */
 	HL_USR_INTR_STRUCT_INIT(hdev->tpc_interrupt, hdev, 0, HL_USR_INTERRUPT_TPC);
 
-	/* Initialize general purpose interrupt */
+	/* Initialize unexpected error interrupt */
 	HL_USR_INTR_STRUCT_INIT(hdev->unexpected_error_interrupt, hdev, 0,
 						HL_USR_INTERRUPT_UNEXPECTED);
 
@@ -4015,7 +4014,7 @@ static const char *gaudi2_irq_name(u16 irq_number)
 	case GAUDI2_IRQ_NUM_TPC_ASSERT:
 		return "gaudi2 tpc assert";
 	case GAUDI2_IRQ_NUM_UNEXPECTED_ERROR:
-		return "gaudi2 tpc assert";
+		return "gaudi2 unexpected error";
 	case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST:
 		return "gaudi2 user completion";
 	default:
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-03-30  7:22 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-30  7:22 [PATCH 1/7] accel/habanalabs: print raw binning masks in debug level Oded Gabbay
2023-03-30  7:22 ` [PATCH 2/7] accel/habanalabs: remove completion from abnormal interrupt work name Oded Gabbay
2023-03-30  7:22 ` [PATCH 3/7] accel/habanalabs: fix events mask of decoder abnormal interrupts Oded Gabbay
2023-03-30  7:22 ` [PATCH 4/7] accel/habanalabs: fix wrong reset and event flags Oded Gabbay
2023-03-30  7:22 ` [PATCH 5/7] accel/habanalabs: sync f/w events interrupt in hard reset Oded Gabbay
2023-03-30  7:22 ` [PATCH 6/7] accel/habanalabs: don't wait for STS_OK after sending COMMS WFE Oded Gabbay
2023-03-30  7:22 ` [PATCH 7/7] accel/habanalabs: fixes for unexpected error interrupt Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).