All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] habanalabs: allow reset upon device release
@ 2021-06-21  7:24 Oded Gabbay
  2021-06-21  7:24 ` [PATCH 2/5] habanalabs: get lower/upper 32 bits via masking Oded Gabbay
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-06-21  7:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

We introduce a new type of reset, which is performed upon device release.
This reset is very similar to a soft reset, except that it is
performed only upon device release and not upon a user sysfs request
or a TDR.

The purpose of this reset is to make sure the device is returned to
IDLE state after the current user has finished working with the device.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 25 +++++++++++++++++----
 drivers/misc/habanalabs/common/habanalabs.h |  7 ++++++
 drivers/misc/habanalabs/common/sysfs.c      |  2 +-
 drivers/misc/habanalabs/goya/goya.c         |  1 +
 4 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 37ce38d9a1a7..ff4cbde289c0 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -86,7 +86,7 @@ static void hpriv_release(struct kref *ref)
 
 	if ((hdev->reset_if_device_not_idle && !device_is_idle)
 			|| hdev->reset_upon_device_release)
-		hl_device_reset(hdev, 0);
+		hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE);
 }
 
 void hl_hpriv_get(struct hl_fpriv *hpriv)
@@ -885,7 +885,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-	bool hard_reset, from_hard_reset_thread;
+	bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
 	int i, rc;
 
 	if (!hdev->init_done) {
@@ -897,11 +897,28 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	hard_reset = (flags & HL_RESET_HARD) != 0;
 	from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
 
-	if ((!hard_reset) && (!hdev->supports_soft_reset)) {
-		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+	if (!hard_reset && !hdev->supports_soft_reset) {
+		hard_instead_soft = true;
 		hard_reset = true;
 	}
 
+	if (hdev->reset_upon_device_release &&
+			(flags & HL_RESET_DEVICE_RELEASE)) {
+		dev_dbg(hdev->dev,
+			"Perform %s-reset upon device release\n",
+			hard_reset ? "hard" : "soft");
+		goto do_reset;
+	}
+
+	if (!hard_reset && !hdev->allow_external_soft_reset) {
+		hard_instead_soft = true;
+		hard_reset = true;
+	}
+
+	if (hard_instead_soft)
+		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+
+do_reset:
 	/* Re-entry of reset thread */
 	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
 		goto kill_processes;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 09b89fdeba0b..fad112a01009 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -119,11 +119,15 @@ enum hl_mmu_page_table_location {
  *
  * - HL_RESET_TDR
  *       Set if reset is due to TDR
+ *
+ * - HL_RESET_DEVICE_RELEASE
+ *       Set if reset is due to device release
  */
 #define HL_RESET_HARD			(1 << 0)
 #define HL_RESET_FROM_RESET_THREAD	(1 << 1)
 #define HL_RESET_HEARTBEAT		(1 << 2)
 #define HL_RESET_TDR			(1 << 3)
+#define HL_RESET_DEVICE_RELEASE		(1 << 4)
 
 #define HL_MAX_SOBS_PER_MONITOR	8
 
@@ -2181,6 +2185,8 @@ struct hl_mmu_funcs {
  * @collective_mon_idx: helper index for collective initialization
  * @supports_coresight: is CoreSight supported.
  * @supports_soft_reset: is soft reset supported.
+ * @allow_external_soft_reset: true if soft reset initiated by user or TDR is
+ *                             allowed.
  * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
  * @needs_reset: true if reset_on_lockup is false and device should be reset
  *               due to lockup.
@@ -2301,6 +2307,7 @@ struct hl_device {
 	u8				collective_mon_idx;
 	u8				supports_coresight;
 	u8				supports_soft_reset;
+	u8				allow_external_soft_reset;
 	u8				supports_cb_mapping;
 	u8				needs_reset;
 	u8				process_kill_trial_cnt;
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index c9f649b31e3a..db72df282ef8 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -208,7 +208,7 @@ static ssize_t soft_reset_store(struct device *dev,
 		goto out;
 	}
 
-	if (!hdev->supports_soft_reset) {
+	if (!hdev->allow_external_soft_reset) {
 		dev_err(hdev->dev, "Device does not support soft-reset\n");
 		goto out;
 	}
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 5a837c0b4d76..06f5f1439e69 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -954,6 +954,7 @@ static int goya_sw_init(struct hl_device *hdev)
 	spin_lock_init(&goya->hw_queues_lock);
 	hdev->supports_coresight = true;
 	hdev->supports_soft_reset = true;
+	hdev->allow_external_soft_reset = true;
 
 	goya_set_pci_memory_regions(hdev);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/5] habanalabs: get lower/upper 32 bits via masking
  2021-06-21  7:24 [PATCH 1/5] habanalabs: allow reset upon device release Oded Gabbay
@ 2021-06-21  7:24 ` Oded Gabbay
  2021-06-21  7:24 ` [PATCH 3/5] habanalabs: add validity check for signal cs Oded Gabbay
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-06-21  7:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Fix multiple similar occurrences of the following sparse warning by
using lower_32_bits() instead of a (u32) cast:
'warning: cast truncates bits from constant value
(7ffc113000 becomes fc113000)'

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 2 +-
 drivers/misc/habanalabs/goya/goya.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index be830948e051..4a75df240cfc 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3858,7 +3858,7 @@ static void gaudi_init_static_firmware_loader(struct hl_device *hdev)
 	static_loader->boot_err1_reg = mmCPU_BOOT_ERR1;
 	static_loader->preboot_version_offset_reg = mmPREBOOT_VER_OFFSET;
 	static_loader->boot_fit_version_offset_reg = mmUBOOT_VER_OFFSET;
-	static_loader->sram_offset_mask = ~((u32)SRAM_BASE_ADDR);
+	static_loader->sram_offset_mask = ~(lower_32_bits(SRAM_BASE_ADDR));
 }
 
 static void gaudi_init_firmware_loader(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 06f5f1439e69..755e08cf2ecc 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2484,7 +2484,7 @@ static void goya_init_static_firmware_loader(struct hl_device *hdev)
 	static_loader->boot_err1_reg = mmCPU_BOOT_ERR1;
 	static_loader->preboot_version_offset_reg = mmPREBOOT_VER_OFFSET;
 	static_loader->boot_fit_version_offset_reg = mmUBOOT_VER_OFFSET;
-	static_loader->sram_offset_mask = ~((u32)SRAM_BASE_ADDR);
+	static_loader->sram_offset_mask = ~(lower_32_bits(SRAM_BASE_ADDR));
 }
 
 static void goya_init_firmware_loader(struct hl_device *hdev)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/5] habanalabs: add validity check for signal cs
  2021-06-21  7:24 [PATCH 1/5] habanalabs: allow reset upon device release Oded Gabbay
  2021-06-21  7:24 ` [PATCH 2/5] habanalabs: get lower/upper 32 bits via masking Oded Gabbay
@ 2021-06-21  7:24 ` Oded Gabbay
  2021-06-21  7:24 ` [PATCH 4/5] habanalabs/gaudi: add support for NIC DERR Oded Gabbay
  2021-06-21  7:24 ` [PATCH 5/5] habanalabs/gaudi: refactor hard-reset related code Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-06-21  7:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: farah kassabri

From: farah kassabri <fkassabri@habana.ai>

In preparation for a new feature that allows the user to reserve
signals ahead of submissions, we need to change a current assumption
in the code.

Currently, the driver uses 2 SOBs to support signal CS. When the first
SOB reaches its max value, the driver switches to the other one and
assumes that by the time it needs to switch back to the first one, all
of its signals have already been handled.

This assumption won't hold once the new feature is added, because
with signal reservation the driver can reach the max SOB value very
quickly.

The change is to add a validity check when submitting a signal CS, to
make sure the previous SOB is available (all the signals attached to
it indeed finished).

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 55 +++++++++++++++++++
 drivers/misc/habanalabs/common/habanalabs.h   |  2 +
 drivers/misc/habanalabs/common/hw_queue.c     | 42 ++++++--------
 3 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index adedb288d452..80c60fb41bbc 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -1497,6 +1497,61 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 	return rc;
 }
 
+/*
+ * hl_cs_signal_sob_wraparound_handler: handle SOB value wraparound case.
+ * if the SOB value reaches the max value move to the other SOB reserved
+ * to the queue.
+ * Note that this function must be called while hw_queues_lock is taken.
+ */
+int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
+			struct hl_hw_sob **hw_sob, u32 count)
+{
+	struct hl_sync_stream_properties *prop;
+	struct hl_hw_sob *sob = *hw_sob, *other_sob;
+	u8 other_sob_offset;
+
+	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+
+	kref_get(&sob->kref);
+
+	/* check for wraparound */
+	if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
+		/*
+		 * Decrement as we reached the max value.
+		 * The release function won't be called here as we've
+		 * just incremented the refcount right before calling this
+		 * function.
+		 */
+		kref_put(&sob->kref, hl_sob_reset_error);
+
+		/*
+		 * check the other sob value, if it still in use then fail
+		 * otherwise make the switch
+		 */
+		other_sob_offset = (prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
+		other_sob = &prop->hw_sob[other_sob_offset];
+
+		if (kref_read(&other_sob->kref) != 1) {
+			dev_err(hdev->dev, "error: Cannot switch SOBs q_idx: %d\n",
+								q_idx);
+			return -EINVAL;
+		}
+
+		prop->next_sob_val = 1;
+
+		/* only two SOBs are currently in use */
+		prop->curr_sob_offset = other_sob_offset;
+		*hw_sob = other_sob;
+
+		dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
+				prop->curr_sob_offset, q_idx);
+	} else {
+		prop->next_sob_val += count;
+	}
+
+	return 0;
+}
+
 static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
 		struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx)
 {
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index fad112a01009..98aa8524a6a6 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2642,6 +2642,8 @@ int hl_set_voltage(struct hl_device *hdev,
 int hl_set_current(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value);
 void hl_release_pending_user_interrupts(struct hl_device *hdev);
+int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
+			struct hl_hw_sob **hw_sob, u32 count);
 
 #ifdef CONFIG_DEBUG_FS
 
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 173438461835..bcabfdbf1e01 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -410,19 +410,20 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
 	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
-static void init_signal_cs(struct hl_device *hdev,
+static int init_signal_cs(struct hl_device *hdev,
 		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
 {
 	struct hl_sync_stream_properties *prop;
 	struct hl_hw_sob *hw_sob;
 	u32 q_idx;
+	int rc = 0;
 
 	q_idx = job->hw_queue_id;
 	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
 	hw_sob = &prop->hw_sob[prop->curr_sob_offset];
 
 	cs_cmpl->hw_sob = hw_sob;
-	cs_cmpl->sob_val = prop->next_sob_val++;
+	cs_cmpl->sob_val = prop->next_sob_val;
 
 	dev_dbg(hdev->dev,
 		"generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n",
@@ -434,24 +435,9 @@ static void init_signal_cs(struct hl_device *hdev,
 	hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
 				cs_cmpl->hw_sob->sob_id, 0, true);
 
-	kref_get(&hw_sob->kref);
+	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1);
 
-	/* check for wraparound */
-	if (prop->next_sob_val == HL_MAX_SOB_VAL) {
-		/*
-		 * Decrement as we reached the max value.
-		 * The release function won't be called here as we've
-		 * just incremented the refcount.
-		 */
-		kref_put(&hw_sob->kref, hl_sob_reset_error);
-		prop->next_sob_val = 1;
-		/* only two SOBs are currently in use */
-		prop->curr_sob_offset =
-			(prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
-
-		dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
-				prop->curr_sob_offset, q_idx);
-	}
+	return rc;
 }
 
 static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
@@ -504,22 +490,25 @@ static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
  *
  * H/W queues spinlock should be taken before calling this function
  */
-static void init_signal_wait_cs(struct hl_cs *cs)
+static int init_signal_wait_cs(struct hl_cs *cs)
 {
 	struct hl_ctx *ctx = cs->ctx;
 	struct hl_device *hdev = ctx->hdev;
 	struct hl_cs_job *job;
 	struct hl_cs_compl *cs_cmpl =
 			container_of(cs->fence, struct hl_cs_compl, base_fence);
+	int rc = 0;
 
 	/* There is only one job in a signal/wait CS */
 	job = list_first_entry(&cs->job_list, struct hl_cs_job,
 				cs_node);
 
 	if (cs->type & CS_TYPE_SIGNAL)
-		init_signal_cs(hdev, job, cs_cmpl);
+		rc = init_signal_cs(hdev, job, cs_cmpl);
 	else if (cs->type & CS_TYPE_WAIT)
 		init_wait_cs(hdev, cs, job, cs_cmpl);
+
+	return rc;
 }
 
 /*
@@ -590,11 +579,16 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 		}
 	}
 
-	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT))
-		init_signal_wait_cs(cs);
-	else if (cs->type == CS_TYPE_COLLECTIVE_WAIT)
+	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT)) {
+		rc = init_signal_wait_cs(cs);
+		if (rc) {
+			dev_err(hdev->dev, "Failed to submit signal cs\n");
+			goto unroll_cq_resv;
+		}
+	} else if (cs->type == CS_TYPE_COLLECTIVE_WAIT)
 		hdev->asic_funcs->collective_wait_init_cs(cs);
 
+
 	spin_lock(&hdev->cs_mirror_lock);
 
 	/* Verify staged CS exists and add to the staged list */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/5] habanalabs/gaudi: add support for NIC DERR
  2021-06-21  7:24 [PATCH 1/5] habanalabs: allow reset upon device release Oded Gabbay
  2021-06-21  7:24 ` [PATCH 2/5] habanalabs: get lower/upper 32 bits via masking Oded Gabbay
  2021-06-21  7:24 ` [PATCH 3/5] habanalabs: add validity check for signal cs Oded Gabbay
@ 2021-06-21  7:24 ` Oded Gabbay
  2021-06-21  7:24 ` [PATCH 5/5] habanalabs/gaudi: refactor hard-reset related code Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-06-21  7:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Add support for NIC DERR ECC error events. When such an error
is received, a device reset is performed.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c             |  1 +
 .../habanalabs/include/gaudi/gaudi_async_events.h |  5 +++++
 .../include/gaudi/gaudi_async_ids_map_extended.h  | 15 ++++++++++-----
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4a75df240cfc..82d5613f291b 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7870,6 +7870,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR:
 	case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
 	case GAUDI_EVENT_MMU_DERR:
+	case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
 		gaudi_print_irq_info(hdev, event_type, true);
 		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
 		goto reset_device;
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
index 2aee18e19b5a..d966bd4dfea6 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
@@ -252,6 +252,11 @@ enum gaudi_async_event_id {
 	GAUDI_EVENT_HBM3_SPI_0 = 407,
 	GAUDI_EVENT_HBM3_SPI_1 = 408,
 	GAUDI_EVENT_PSOC_GPIO_U16_0 = 421,
+	GAUDI_EVENT_NIC0_CS_DBG_DERR = 483,
+	GAUDI_EVENT_NIC1_CS_DBG_DERR = 487,
+	GAUDI_EVENT_NIC2_CS_DBG_DERR = 491,
+	GAUDI_EVENT_NIC3_CS_DBG_DERR = 495,
+	GAUDI_EVENT_NIC4_CS_DBG_DERR = 499,
 	GAUDI_EVENT_RAZWI_OR_ADC = 548,
 	GAUDI_EVENT_TPC0_QM = 572,
 	GAUDI_EVENT_TPC1_QM = 573,
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h b/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
index ac4d4b51da7f..479b6b038254 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h
@@ -507,23 +507,28 @@ static struct gaudi_async_events_ids_map gaudi_irq_map_table[] = {
 	{ .fc_id = 480, .cpu_id = 329, .valid = 0, .name = "" },
 	{ .fc_id = 481, .cpu_id = 330, .valid = 0, .name = "" },
 	{ .fc_id = 482, .cpu_id = 331, .valid = 0, .name = "" },
-	{ .fc_id = 483, .cpu_id = 332, .valid = 0, .name = "" },
+	{ .fc_id = 483, .cpu_id = 332, .valid = 1,
+		.name = "NIC0_CS_DBG_DERR" },
 	{ .fc_id = 484, .cpu_id = 333, .valid = 0, .name = "" },
 	{ .fc_id = 485, .cpu_id = 334, .valid = 0, .name = "" },
 	{ .fc_id = 486, .cpu_id = 335, .valid = 0, .name = "" },
-	{ .fc_id = 487, .cpu_id = 336, .valid = 0, .name = "" },
+	{ .fc_id = 487, .cpu_id = 336, .valid = 1,
+		.name = "NIC1_CS_DBG_DERR" },
 	{ .fc_id = 488, .cpu_id = 337, .valid = 0, .name = "" },
 	{ .fc_id = 489, .cpu_id = 338, .valid = 0, .name = "" },
 	{ .fc_id = 490, .cpu_id = 339, .valid = 0, .name = "" },
-	{ .fc_id = 491, .cpu_id = 340, .valid = 0, .name = "" },
+	{ .fc_id = 491, .cpu_id = 340, .valid = 1,
+		.name = "NIC2_CS_DBG_DERR" },
 	{ .fc_id = 492, .cpu_id = 341, .valid = 0, .name = "" },
 	{ .fc_id = 493, .cpu_id = 342, .valid = 0, .name = "" },
 	{ .fc_id = 494, .cpu_id = 343, .valid = 0, .name = "" },
-	{ .fc_id = 495, .cpu_id = 344, .valid = 0, .name = "" },
+	{ .fc_id = 495, .cpu_id = 344, .valid = 1,
+		.name = "NIC3_CS_DBG_DERR" },
 	{ .fc_id = 496, .cpu_id = 345, .valid = 0, .name = "" },
 	{ .fc_id = 497, .cpu_id = 346, .valid = 0, .name = "" },
 	{ .fc_id = 498, .cpu_id = 347, .valid = 0, .name = "" },
-	{ .fc_id = 499, .cpu_id = 348, .valid = 0, .name = "" },
+	{ .fc_id = 499, .cpu_id = 348, .valid = 1,
+		.name = "NIC4_CS_DBG_DERR" },
 	{ .fc_id = 500, .cpu_id = 349, .valid = 0, .name = "" },
 	{ .fc_id = 501, .cpu_id = 350, .valid = 0, .name = "" },
 	{ .fc_id = 502, .cpu_id = 351, .valid = 0, .name = "" },
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/5] habanalabs/gaudi: refactor hard-reset related code
  2021-06-21  7:24 [PATCH 1/5] habanalabs: allow reset upon device release Oded Gabbay
                   ` (2 preceding siblings ...)
  2021-06-21  7:24 ` [PATCH 4/5] habanalabs/gaudi: add support for NIC DERR Oded Gabbay
@ 2021-06-21  7:24 ` Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-06-21  7:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Some of the hard-reset related code is implemented in gaudi-specific
code. However, this code can be used by future ASICs, and therefore it
is better to move it to the common code section.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 41 +++++++++++++++++
 drivers/misc/habanalabs/common/habanalabs.h  |  9 ++++
 drivers/misc/habanalabs/gaudi/gaudi.c        | 48 +++-----------------
 drivers/misc/habanalabs/gaudi/gaudiP.h       |  5 --
 4 files changed, 56 insertions(+), 47 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index ce87053d4fde..2e4d04ec6b53 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -972,6 +972,47 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
 	return rc;
 }
 
+void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
+{
+	struct static_fw_load_mgr *static_loader =
+			&hdev->fw_loader.static_loader;
+	int rc;
+
+	if (hdev->asic_prop.dynamic_fw_load) {
+		rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
+				COMMS_RST_DEV, 0, false,
+				hdev->fw_loader.cpu_timeout);
+		if (rc)
+			dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n");
+	} else {
+		WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV);
+	}
+}
+
+void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
+{
+	struct static_fw_load_mgr *static_loader =
+			&hdev->fw_loader.static_loader;
+	int rc;
+
+	if (hdev->device_cpu_is_halted)
+		return;
+
+	/* Stop device CPU to make sure nothing bad happens */
+	if (hdev->asic_prop.dynamic_fw_load) {
+		rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
+				COMMS_GOTO_WFE, 0, true,
+				hdev->fw_loader.cpu_timeout);
+		if (rc)
+			dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
+	} else {
+		WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE);
+		msleep(static_loader->cpu_reset_wait_msec);
+	}
+
+	hdev->device_cpu_is_halted = true;
+}
+
 static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
 {
 	/* Some of the status codes below are deprecated in newer f/w
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 98aa8524a6a6..6b3cdd7e068a 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -894,6 +894,7 @@ struct pci_mem_region {
  * @preboot_version_offset_reg: SRAM offset to preboot version register
  * @boot_fit_version_offset_reg: SRAM offset to boot fit version register
  * @sram_offset_mask: mask for getting offset into the SRAM
+ * @cpu_reset_wait_msec: used when setting WFE via kmd_msg_to_cpu_reg
  */
 struct static_fw_load_mgr {
 	u64 preboot_version_max_off;
@@ -908,6 +909,7 @@ struct static_fw_load_mgr {
 	u32 preboot_version_offset_reg;
 	u32 boot_fit_version_offset_reg;
 	u32 sram_offset_mask;
+	u32 cpu_reset_wait_msec;
 };
 
 /**
@@ -2199,6 +2201,10 @@ struct hl_mmu_funcs {
  *                    triggered, and cleared after it is shared with preboot.
  * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
  *                         complete instead.
+ * @device_cpu_is_halted: Flag to indicate whether the device CPU was already
+ *                        halted. We can't halt it again because the COMMS
+ *                        protocol will throw an error. Relevant only for
+ *                        cases where Linux was not loaded to device CPU
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2315,6 +2321,7 @@ struct hl_device {
 	u8				supports_staged_submission;
 	u8				curr_reset_cause;
 	u8				skip_reset_on_timeout;
+	u8				device_cpu_is_halted;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
@@ -2596,6 +2603,8 @@ int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index,
 int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
 		u16 *pll_freq_arr);
 int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power);
+void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev);
+void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev);
 int hl_fw_init_cpu(struct hl_device *hdev);
 int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
 				u32 sts_boot_dev_sts0_reg,
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 82d5613f291b..aa8a0ca5aca2 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1934,45 +1934,6 @@ static void gaudi_disable_msi(struct hl_device *hdev)
 	gaudi->hw_cap_initialized &= ~HW_CAP_MSI;
 }
 
-static void gaudi_ask_hard_reset_without_linux(struct hl_device *hdev)
-{
-	int rc;
-
-	if (hdev->asic_prop.dynamic_fw_load) {
-		rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
-				COMMS_RST_DEV, 0, false,
-				hdev->fw_loader.cpu_timeout);
-		if (rc)
-			dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n");
-	} else {
-		WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_RST_DEV);
-	}
-}
-
-static void gaudi_ask_halt_machine_without_linux(struct hl_device *hdev)
-{
-	struct gaudi_device *gaudi = hdev->asic_specific;
-	int rc;
-
-	if (gaudi && gaudi->device_cpu_is_halted)
-		return;
-
-	/* Stop device CPU to make sure nothing bad happens */
-	if (hdev->asic_prop.dynamic_fw_load) {
-		rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
-				COMMS_GOTO_WFE, 0, true,
-				hdev->fw_loader.cpu_timeout);
-		if (rc)
-			dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
-	} else {
-		WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE);
-		msleep(GAUDI_CPU_RESET_WAIT_MSEC);
-	}
-
-	if (gaudi)
-		gaudi->device_cpu_is_halted = true;
-}
-
 static void gaudi_init_scrambler_sram(struct hl_device *hdev)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
@@ -3859,6 +3820,9 @@ static void gaudi_init_static_firmware_loader(struct hl_device *hdev)
 	static_loader->preboot_version_offset_reg = mmPREBOOT_VER_OFFSET;
 	static_loader->boot_fit_version_offset_reg = mmUBOOT_VER_OFFSET;
 	static_loader->sram_offset_mask = ~(lower_32_bits(SRAM_BASE_ADDR));
+	static_loader->cpu_reset_wait_msec = hdev->pldm ?
+			GAUDI_PLDM_RESET_WAIT_MSEC :
+			GAUDI_CPU_RESET_WAIT_MSEC;
 }
 
 static void gaudi_init_firmware_loader(struct hl_device *hdev)
@@ -4151,9 +4115,9 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 			gaudi_irq_map_table[GAUDI_EVENT_HALT_MACHINE].cpu_id);
 	} else {
 		if (hdev->asic_prop.hard_reset_done_by_fw)
-			gaudi_ask_hard_reset_without_linux(hdev);
+			hl_fw_ask_hard_reset_without_linux(hdev);
 		else
-			gaudi_ask_halt_machine_without_linux(hdev);
+			hl_fw_ask_halt_machine_without_linux(hdev);
 	}
 
 	if (driver_performs_reset) {
@@ -4228,7 +4192,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 
 		memset(gaudi->events_stat, 0, sizeof(gaudi->events_stat));
 
-		gaudi->device_cpu_is_halted = false;
+		hdev->device_cpu_is_halted = false;
 	}
 }
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index b23336af191e..957bf3720f70 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -315,10 +315,6 @@ struct gaudi_internal_qman_info {
  *                  Multi MSI is possible only with IOMMU enabled.
  * @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an
  *                    8-bit value so use u8.
- * @device_cpu_is_halted: Flag to indicate whether the device CPU was already
- *                        halted. We can't halt it again because the COMMS
- *                        protocol will throw an error. Relevant only for
- *                        cases where Linux was not loaded to device CPU
  */
 struct gaudi_device {
 	int (*cpucp_info_get)(struct hl_device *hdev);
@@ -340,7 +336,6 @@ struct gaudi_device {
 	u32				hw_cap_initialized;
 	u8				multi_msi_mode;
 	u8				mmu_cache_inv_pi;
-	u8				device_cpu_is_halted;
 };
 
 void gaudi_init_security(struct hl_device *hdev);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-06-21  7:24 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-21  7:24 [PATCH 1/5] habanalabs: allow reset upon device release Oded Gabbay
2021-06-21  7:24 ` [PATCH 2/5] habanalabs: get lower/upper 32 bits via masking Oded Gabbay
2021-06-21  7:24 ` [PATCH 3/5] habanalabs: add validity check for signal cs Oded Gabbay
2021-06-21  7:24 ` [PATCH 4/5] habanalabs/gaudi: add support for NIC DERR Oded Gabbay
2021-06-21  7:24 ` [PATCH 5/5] habanalabs/gaudi: refactor hard-reset related code Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.