All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] habanalabs/gaudi: correct driver events numbering
@ 2021-06-17 14:40 Oded Gabbay
  2021-06-17 14:40 ` [PATCH 2/3] habanalabs: fix typo Oded Gabbay
  2021-06-17 14:40 ` [PATCH 3/3] debugfs: add skip_reset_on_timeout option Oded Gabbay
  0 siblings, 2 replies; 3+ messages in thread
From: Oded Gabbay @ 2021-06-17 14:40 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Currently the driver sends the fc interrupt id to the FW instead of
using the cpu interrupt id. We intend to fix that and keep backward
compatibility by using the same interrupt values.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c            | 12 ++++++++----
 .../include/gaudi/gaudi_async_events.h           |  8 ++++----
 .../include/gaudi/gaudi_async_ids_map_extended.h | 16 ++++++++--------
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 3a66e56d7cd5..33f36da766fc 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3961,7 +3961,8 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout)
 			mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 			le32_to_cpu(dyn_regs->gic_host_pi_upd_irq);
 
-	WREG32(irq_handler_offset, GAUDI_EVENT_PI_UPDATE);
+	WREG32(irq_handler_offset,
+		gaudi_irq_map_table[GAUDI_EVENT_PI_UPDATE].cpu_id);
 
 	err = hl_poll_timeout(
 		hdev,
@@ -4147,7 +4148,8 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 				mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 				le32_to_cpu(dyn_regs->gic_host_halt_irq);
 
-		WREG32(irq_handler_offset, GAUDI_EVENT_HALT_MACHINE);
+		WREG32(irq_handler_offset,
+			gaudi_irq_map_table[GAUDI_EVENT_HALT_MACHINE].cpu_id);
 	} else {
 		if (hdev->asic_prop.hard_reset_done_by_fw)
 			gaudi_ask_hard_reset_without_linux(hdev);
@@ -4600,7 +4602,8 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 				mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 				le32_to_cpu(dyn_regs->gic_host_pi_upd_irq);
 
-		WREG32(irq_handler_offset, GAUDI_EVENT_PI_UPDATE);
+		WREG32(irq_handler_offset,
+			gaudi_irq_map_table[GAUDI_EVENT_PI_UPDATE].cpu_id);
 	}
 }
 
@@ -8989,7 +8992,8 @@ static void gaudi_enable_events_from_fw(struct hl_device *hdev)
 			mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 			le32_to_cpu(dyn_regs->gic_host_ints_irq);
 
-	WREG32(irq_handler_offset, GAUDI_EVENT_INTS_REGISTER);
+	WREG32(irq_handler_offset,
+		gaudi_irq_map_table[GAUDI_EVENT_INTS_REGISTER].cpu_id);
 }
 
 static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx)
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
index f66c759952e4..2aee18e19b5a 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
@@ -252,10 +252,6 @@ enum gaudi_async_event_id {
 	GAUDI_EVENT_HBM3_SPI_0 = 407,
 	GAUDI_EVENT_HBM3_SPI_1 = 408,
 	GAUDI_EVENT_PSOC_GPIO_U16_0 = 421,
-	GAUDI_EVENT_PI_UPDATE = 484,
-	GAUDI_EVENT_HALT_MACHINE = 485,
-	GAUDI_EVENT_INTS_REGISTER = 486,
-	GAUDI_EVENT_SOFT_RESET = 487,
 	GAUDI_EVENT_RAZWI_OR_ADC = 548,
 	GAUDI_EVENT_TPC0_QM = 572,
 	GAUDI_EVENT_TPC1_QM = 573,
@@ -303,6 +299,10 @@ enum gaudi_async_event_id {
 	GAUDI_EVENT_NIC3_QP1 = 619,
 	GAUDI_EVENT_NIC4_QP0 = 620,
 	GAUDI_EVENT_NIC4_QP1 = 621,
+	GAUDI_EVENT_PI_UPDATE = 635,
+	GAUDI_EVENT_HALT_MACHINE = 636,
+	GAUDI_EVENT_INTS_REGISTER = 637,
+	GAUDI_EVENT_SOFT_RESET = 638,
 	GAUDI_EVENT_FW_ALIVE_S = 645,
 	GAUDI_EVENT_DEV_RESET_REQ = 646,
 	GAUDI_EVENT_PKT_QUEUE_OUT_SYNC = 647,
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h b/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
index e87554ab0102..ac4d4b51da7f 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
@@ -508,10 +508,10 @@ static struct gaudi_async_events_ids_map gaudi_irq_map_table[] = {
 	{ .fc_id = 481, .cpu_id = 330, .valid = 0, .name = "" },
 	{ .fc_id = 482, .cpu_id = 331, .valid = 0, .name = "" },
 	{ .fc_id = 483, .cpu_id = 332, .valid = 0, .name = "" },
-	{ .fc_id = 484, .cpu_id = 333, .valid = 1, .name = "PI_UPDATE" },
-	{ .fc_id = 485, .cpu_id = 334, .valid = 1, .name = "HALT_MACHINE" },
-	{ .fc_id = 486, .cpu_id = 335, .valid = 1, .name = "INTS_REGISTER" },
-	{ .fc_id = 487, .cpu_id = 336, .valid = 1, .name = "SOFT_RESET" },
+	{ .fc_id = 484, .cpu_id = 333, .valid = 0, .name = "" },
+	{ .fc_id = 485, .cpu_id = 334, .valid = 0, .name = "" },
+	{ .fc_id = 486, .cpu_id = 335, .valid = 0, .name = "" },
+	{ .fc_id = 487, .cpu_id = 336, .valid = 0, .name = "" },
 	{ .fc_id = 488, .cpu_id = 337, .valid = 0, .name = "" },
 	{ .fc_id = 489, .cpu_id = 338, .valid = 0, .name = "" },
 	{ .fc_id = 490, .cpu_id = 339, .valid = 0, .name = "" },
@@ -659,10 +659,10 @@ static struct gaudi_async_events_ids_map gaudi_irq_map_table[] = {
 	{ .fc_id = 632, .cpu_id = 481, .valid = 0, .name = "" },
 	{ .fc_id = 633, .cpu_id = 482, .valid = 0, .name = "" },
 	{ .fc_id = 634, .cpu_id = 483, .valid = 0, .name = "" },
-	{ .fc_id = 635, .cpu_id = 484, .valid = 0, .name = "" },
-	{ .fc_id = 636, .cpu_id = 485, .valid = 0, .name = "" },
-	{ .fc_id = 637, .cpu_id = 486, .valid = 0, .name = "" },
-	{ .fc_id = 638, .cpu_id = 487, .valid = 0, .name = "" },
+	{ .fc_id = 635, .cpu_id = 484, .valid = 1, .name = "PI_UPDATE" },
+	{ .fc_id = 636, .cpu_id = 485, .valid = 1, .name = "HALT_MACHINE" },
+	{ .fc_id = 637, .cpu_id = 486, .valid = 1, .name = "INTS_REGISTER" },
+	{ .fc_id = 638, .cpu_id = 487, .valid = 1, .name = "SOFT_RESET" },
 	{ .fc_id = 639, .cpu_id = 488, .valid = 0, .name = "" },
 	{ .fc_id = 640, .cpu_id = 489, .valid = 0, .name = "" },
 	{ .fc_id = 641, .cpu_id = 490, .valid = 0, .name = "" },
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/3] habanalabs: fix typo
  2021-06-17 14:40 [PATCH 1/3] habanalabs/gaudi: correct driver events numbering Oded Gabbay
@ 2021-06-17 14:40 ` Oded Gabbay
  2021-06-17 14:40 ` [PATCH 3/3] debugfs: add skip_reset_on_timeout option Oded Gabbay
  1 sibling, 0 replies; 3+ messages in thread
From: Oded Gabbay @ 2021-06-17 14:40 UTC (permalink / raw)
  To: linux-kernel; +Cc: Zvika Yehudai

From: Zvika Yehudai <zyehudai@habana.ai>

Fix a typo in a comment.

Signed-off-by: Zvika Yehudai <zyehudai@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index e545574b1d38..f8b7080e0570 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2119,7 +2119,7 @@ struct hl_mmu_funcs {
  * @kernel_queues: array of hl_hw_queue.
  * @cs_mirror_list: CS mirror list for TDR.
  * @cs_mirror_lock: protects cs_mirror_list.
- * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
+ * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CBs.
  * @event_queue: event queue for IRQ from CPU-CP.
  * @dma_pool: DMA pool for small allocations.
  * @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address.
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 3/3] debugfs: add skip_reset_on_timeout option
  2021-06-17 14:40 [PATCH 1/3] habanalabs/gaudi: correct driver events numbering Oded Gabbay
  2021-06-17 14:40 ` [PATCH 2/3] habanalabs: fix typo Oded Gabbay
@ 2021-06-17 14:40 ` Oded Gabbay
  1 sibling, 0 replies; 3+ messages in thread
From: Oded Gabbay @ 2021-06-17 14:40 UTC (permalink / raw)
  To: linux-kernel; +Cc: Yuri Nudelman

From: Yuri Nudelman <ynudelman@habana.ai>

To be able to debug a long-running CS better, without changing the
userspace code, we are adding a new option through the debugfs
interface to skip the reset of the device in case of a CS timeout.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 Documentation/ABI/testing/debugfs-driver-habanalabs | 8 ++++++++
 drivers/misc/habanalabs/common/command_submission.c | 1 +
 drivers/misc/habanalabs/common/debugfs.c            | 5 +++++
 drivers/misc/habanalabs/common/habanalabs.h         | 3 +++
 4 files changed, 17 insertions(+)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index c78fc9282876..e78ceb1f70b3 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -207,6 +207,14 @@ Contact:        ogabbay@kernel.org
 Description:    Sets the PCI power state. Valid values are "1" for D0 and "2"
                 for D3Hot
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/skip_reset_on_timeout
+Date:           Jun 2021
+KernelVersion:  5.13
+Contact:        ynudelman@habana.ai
+Description:    Sets the skip reset on timeout option for the device. Value of
+                "0" means device will be reset in case some CS has timed out,
+                otherwise it will not be reset.
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
 Date:           Mar 2020
 KernelVersion:  5.6
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 6d51f54030c1..adedb288d452 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -663,6 +663,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
 	cs->timeout_jiffies = timeout;
 	cs->skip_reset_on_timeout =
+		hdev->skip_reset_on_timeout ||
 		!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
 	cs->submission_time_jiffies = jiffies;
 	INIT_LIST_HEAD(&cs->job_list);
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 8381155578a0..703d79fb6f3f 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -1278,6 +1278,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 				dev_entry->root,
 				&dev_entry->blob_desc);
 
+	debugfs_create_x8("skip_reset_on_timeout",
+				0644,
+				dev_entry->root,
+				&hdev->skip_reset_on_timeout);
+
 	for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
 		debugfs_create_file(hl_debugfs_list[i].name,
 					0444,
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index f8b7080e0570..12d9dc42e05e 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2222,6 +2222,8 @@ struct hl_mmu_funcs {
  * @supports_staged_submission: true if staged submissions are supported
  * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
  *                    triggered, and cleared after it is shared with preboot.
+ * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
+ *                         complete instead.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2337,6 +2339,7 @@ struct hl_device {
 	u8				device_fini_pending;
 	u8				supports_staged_submission;
 	u8				curr_reset_cause;
+	u8				skip_reset_on_timeout;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-06-17 14:40 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-17 14:40 [PATCH 1/3] habanalabs/gaudi: correct driver events numbering Oded Gabbay
2021-06-17 14:40 ` [PATCH 2/3] habanalabs: fix typo Oded Gabbay
2021-06-17 14:40 ` [PATCH 3/3] debugfs: add skip_reset_on_timeout option Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.