linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] habanalabs/gaudi: correct driver events numbering
@ 2021-06-17 14:40 Oded Gabbay
  2021-06-17 14:40 ` [PATCH 2/3] habanalabs: fix typo Oded Gabbay
  2021-06-17 14:40 ` [PATCH 3/3] debugfs: add skip_reset_on_timeout option Oded Gabbay
  0 siblings, 2 replies; 3+ messages in thread
From: Oded Gabbay @ 2021-06-17 14:40 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Currently, the driver sends the fc interrupt id to the FW instead of
the cpu interrupt id. Fix that, and keep backward compatibility by
using the same interrupt values.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c            | 12 ++++++++----
 .../include/gaudi/gaudi_async_events.h           |  8 ++++----
 .../include/gaudi/gaudi_async_ids_map_extended.h | 16 ++++++++--------
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 3a66e56d7cd5..33f36da766fc 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3961,7 +3961,8 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout)
 			mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 			le32_to_cpu(dyn_regs->gic_host_pi_upd_irq);
 
-	WREG32(irq_handler_offset, GAUDI_EVENT_PI_UPDATE);
+	WREG32(irq_handler_offset,
+		gaudi_irq_map_table[GAUDI_EVENT_PI_UPDATE].cpu_id);
 
 	err = hl_poll_timeout(
 		hdev,
@@ -4147,7 +4148,8 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 				mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 				le32_to_cpu(dyn_regs->gic_host_halt_irq);
 
-		WREG32(irq_handler_offset, GAUDI_EVENT_HALT_MACHINE);
+		WREG32(irq_handler_offset,
+			gaudi_irq_map_table[GAUDI_EVENT_HALT_MACHINE].cpu_id);
 	} else {
 		if (hdev->asic_prop.hard_reset_done_by_fw)
 			gaudi_ask_hard_reset_without_linux(hdev);
@@ -4600,7 +4602,8 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 				mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 				le32_to_cpu(dyn_regs->gic_host_pi_upd_irq);
 
-		WREG32(irq_handler_offset, GAUDI_EVENT_PI_UPDATE);
+		WREG32(irq_handler_offset,
+			gaudi_irq_map_table[GAUDI_EVENT_PI_UPDATE].cpu_id);
 	}
 }
 
@@ -8989,7 +8992,8 @@ static void gaudi_enable_events_from_fw(struct hl_device *hdev)
 			mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 			le32_to_cpu(dyn_regs->gic_host_ints_irq);
 
-	WREG32(irq_handler_offset, GAUDI_EVENT_INTS_REGISTER);
+	WREG32(irq_handler_offset,
+		gaudi_irq_map_table[GAUDI_EVENT_INTS_REGISTER].cpu_id);
 }
 
 static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx)
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
index f66c759952e4..2aee18e19b5a 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
@@ -252,10 +252,6 @@ enum gaudi_async_event_id {
 	GAUDI_EVENT_HBM3_SPI_0 = 407,
 	GAUDI_EVENT_HBM3_SPI_1 = 408,
 	GAUDI_EVENT_PSOC_GPIO_U16_0 = 421,
-	GAUDI_EVENT_PI_UPDATE = 484,
-	GAUDI_EVENT_HALT_MACHINE = 485,
-	GAUDI_EVENT_INTS_REGISTER = 486,
-	GAUDI_EVENT_SOFT_RESET = 487,
 	GAUDI_EVENT_RAZWI_OR_ADC = 548,
 	GAUDI_EVENT_TPC0_QM = 572,
 	GAUDI_EVENT_TPC1_QM = 573,
@@ -303,6 +299,10 @@ enum gaudi_async_event_id {
 	GAUDI_EVENT_NIC3_QP1 = 619,
 	GAUDI_EVENT_NIC4_QP0 = 620,
 	GAUDI_EVENT_NIC4_QP1 = 621,
+	GAUDI_EVENT_PI_UPDATE = 635,
+	GAUDI_EVENT_HALT_MACHINE = 636,
+	GAUDI_EVENT_INTS_REGISTER = 637,
+	GAUDI_EVENT_SOFT_RESET = 638,
 	GAUDI_EVENT_FW_ALIVE_S = 645,
 	GAUDI_EVENT_DEV_RESET_REQ = 646,
 	GAUDI_EVENT_PKT_QUEUE_OUT_SYNC = 647,
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h b/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
index e87554ab0102..ac4d4b51da7f 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
@@ -508,10 +508,10 @@ static struct gaudi_async_events_ids_map gaudi_irq_map_table[] = {
 	{ .fc_id = 481, .cpu_id = 330, .valid = 0, .name = "" },
 	{ .fc_id = 482, .cpu_id = 331, .valid = 0, .name = "" },
 	{ .fc_id = 483, .cpu_id = 332, .valid = 0, .name = "" },
-	{ .fc_id = 484, .cpu_id = 333, .valid = 1, .name = "PI_UPDATE" },
-	{ .fc_id = 485, .cpu_id = 334, .valid = 1, .name = "HALT_MACHINE" },
-	{ .fc_id = 486, .cpu_id = 335, .valid = 1, .name = "INTS_REGISTER" },
-	{ .fc_id = 487, .cpu_id = 336, .valid = 1, .name = "SOFT_RESET" },
+	{ .fc_id = 484, .cpu_id = 333, .valid = 0, .name = "" },
+	{ .fc_id = 485, .cpu_id = 334, .valid = 0, .name = "" },
+	{ .fc_id = 486, .cpu_id = 335, .valid = 0, .name = "" },
+	{ .fc_id = 487, .cpu_id = 336, .valid = 0, .name = "" },
 	{ .fc_id = 488, .cpu_id = 337, .valid = 0, .name = "" },
 	{ .fc_id = 489, .cpu_id = 338, .valid = 0, .name = "" },
 	{ .fc_id = 490, .cpu_id = 339, .valid = 0, .name = "" },
@@ -659,10 +659,10 @@ static struct gaudi_async_events_ids_map gaudi_irq_map_table[] = {
 	{ .fc_id = 632, .cpu_id = 481, .valid = 0, .name = "" },
 	{ .fc_id = 633, .cpu_id = 482, .valid = 0, .name = "" },
 	{ .fc_id = 634, .cpu_id = 483, .valid = 0, .name = "" },
-	{ .fc_id = 635, .cpu_id = 484, .valid = 0, .name = "" },
-	{ .fc_id = 636, .cpu_id = 485, .valid = 0, .name = "" },
-	{ .fc_id = 637, .cpu_id = 486, .valid = 0, .name = "" },
-	{ .fc_id = 638, .cpu_id = 487, .valid = 0, .name = "" },
+	{ .fc_id = 635, .cpu_id = 484, .valid = 1, .name = "PI_UPDATE" },
+	{ .fc_id = 636, .cpu_id = 485, .valid = 1, .name = "HALT_MACHINE" },
+	{ .fc_id = 637, .cpu_id = 486, .valid = 1, .name = "INTS_REGISTER" },
+	{ .fc_id = 638, .cpu_id = 487, .valid = 1, .name = "SOFT_RESET" },
 	{ .fc_id = 639, .cpu_id = 488, .valid = 0, .name = "" },
 	{ .fc_id = 640, .cpu_id = 489, .valid = 0, .name = "" },
 	{ .fc_id = 641, .cpu_id = 490, .valid = 0, .name = "" },
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/3] habanalabs: fix typo
  2021-06-17 14:40 [PATCH 1/3] habanalabs/gaudi: correct driver events numbering Oded Gabbay
@ 2021-06-17 14:40 ` Oded Gabbay
  2021-06-17 14:40 ` [PATCH 3/3] debugfs: add skip_reset_on_timeout option Oded Gabbay
  1 sibling, 0 replies; 3+ messages in thread
From: Oded Gabbay @ 2021-06-17 14:40 UTC (permalink / raw)
  To: linux-kernel; +Cc: Zvika Yehudai

From: Zvika Yehudai <zyehudai@habana.ai>

Fix a typo in a comment.

Signed-off-by: Zvika Yehudai <zyehudai@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index e545574b1d38..f8b7080e0570 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2119,7 +2119,7 @@ struct hl_mmu_funcs {
  * @kernel_queues: array of hl_hw_queue.
  * @cs_mirror_list: CS mirror list for TDR.
  * @cs_mirror_lock: protects cs_mirror_list.
- * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
+ * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CBs.
  * @event_queue: event queue for IRQ from CPU-CP.
  * @dma_pool: DMA pool for small allocations.
  * @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address.
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 3/3] debugfs: add skip_reset_on_timeout option
  2021-06-17 14:40 [PATCH 1/3] habanalabs/gaudi: correct driver events numbering Oded Gabbay
  2021-06-17 14:40 ` [PATCH 2/3] habanalabs: fix typo Oded Gabbay
@ 2021-06-17 14:40 ` Oded Gabbay
  1 sibling, 0 replies; 3+ messages in thread
From: Oded Gabbay @ 2021-06-17 14:40 UTC (permalink / raw)
  To: linux-kernel; +Cc: Yuri Nudelman

From: Yuri Nudelman <ynudelman@habana.ai>

To be able to debug long-running CS better, without changing the
userspace code, we are adding a new option through debugfs interface
to skip the reset of the device in case of CS timeout.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 Documentation/ABI/testing/debugfs-driver-habanalabs | 8 ++++++++
 drivers/misc/habanalabs/common/command_submission.c | 1 +
 drivers/misc/habanalabs/common/debugfs.c            | 5 +++++
 drivers/misc/habanalabs/common/habanalabs.h         | 3 +++
 4 files changed, 17 insertions(+)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index c78fc9282876..e78ceb1f70b3 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -207,6 +207,14 @@ Contact:        ogabbay@kernel.org
 Description:    Sets the PCI power state. Valid values are "1" for D0 and "2"
                 for D3Hot
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/skip_reset_on_timeout
+Date:           Jun 2021
+KernelVersion:  5.13
+Contact:        ynudelman@habana.ai
+Description:    Sets the skip reset on timeout option for the device. A value
+                of "0" means the device will be reset if a CS has timed out;
+                any other value means the device will not be reset.
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
 Date:           Mar 2020
 KernelVersion:  5.6
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 6d51f54030c1..adedb288d452 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -663,6 +663,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
 	cs->timeout_jiffies = timeout;
 	cs->skip_reset_on_timeout =
+		hdev->skip_reset_on_timeout ||
 		!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
 	cs->submission_time_jiffies = jiffies;
 	INIT_LIST_HEAD(&cs->job_list);
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 8381155578a0..703d79fb6f3f 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -1278,6 +1278,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 				dev_entry->root,
 				&dev_entry->blob_desc);
 
+	debugfs_create_x8("skip_reset_on_timeout",
+				0644,
+				dev_entry->root,
+				&hdev->skip_reset_on_timeout);
+
 	for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
 		debugfs_create_file(hl_debugfs_list[i].name,
 					0444,
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index f8b7080e0570..12d9dc42e05e 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2222,6 +2222,8 @@ struct hl_mmu_funcs {
  * @supports_staged_submission: true if staged submissions are supported
  * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
  *                    triggered, and cleared after it is shared with preboot.
+ * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
+ *                         complete instead.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2337,6 +2339,7 @@ struct hl_device {
 	u8				device_fini_pending;
 	u8				supports_staged_submission;
 	u8				curr_reset_cause;
+	u8				skip_reset_on_timeout;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-06-17 14:40 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-17 14:40 [PATCH 1/3] habanalabs/gaudi: correct driver events numbering Oded Gabbay
2021-06-17 14:40 ` [PATCH 2/3] habanalabs: fix typo Oded Gabbay
2021-06-17 14:40 ` [PATCH 3/3] debugfs: add skip_reset_on_timeout option Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).