linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/10] habanalabs: don't clear previous f/w indications
@ 2021-11-07 20:51 Oded Gabbay
  2021-11-07 20:51 ` [PATCH 02/10] habanalabs: handle abort scenario for user interrupt Oded Gabbay
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

Once we read indication of whether f/w is doing the reset, we don't
want to clear it, until the next time we read this indication.

Otherwise, we might be in a state of wrong indication.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index c68ad4d7b1bb..9addcfba6a8b 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -1247,8 +1247,7 @@ static void hl_fw_preboot_update_state(struct hl_device *hdev)
 	 * 3. FW application - a. Fetch fw application security status
 	 *                     b. Check whether hard reset is done by fw app
 	 */
-	prop->hard_reset_done_by_fw =
-		!!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
+	prop->hard_reset_done_by_fw = !!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
 
 	dev_dbg(hdev->dev, "Firmware preboot boot device status0 %#x\n",
 							cpu_boot_dev_sts0);
@@ -1915,17 +1914,13 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev,
 
 	hdev->fw_loader.fw_comp_loaded |= FW_TYPE_BOOT_CPU;
 
-	/* Clear reset status since we need to read it again from boot CPU */
-	prop->hard_reset_done_by_fw = false;
-
 	/* Read boot_cpu status bits */
 	if (prop->fw_preboot_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_ENABLED) {
 		prop->fw_bootfit_cpu_boot_dev_sts0 =
 				RREG32(cpu_boot_dev_sts0_reg);
 
-		if (prop->fw_bootfit_cpu_boot_dev_sts0 &
-				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
-			prop->hard_reset_done_by_fw = true;
+		prop->hard_reset_done_by_fw = !!(prop->fw_bootfit_cpu_boot_dev_sts0 &
+							CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
 
 		dev_dbg(hdev->dev, "Firmware boot CPU status0 %#x\n",
 					prop->fw_bootfit_cpu_boot_dev_sts0);
@@ -2125,16 +2120,12 @@ static void hl_fw_linux_update_state(struct hl_device *hdev,
 
 	hdev->fw_loader.fw_comp_loaded |= FW_TYPE_LINUX;
 
-	/* Clear reset status since we need to read again from app */
-	prop->hard_reset_done_by_fw = false;
-
 	/* Read FW application security bits */
 	if (prop->fw_cpu_boot_dev_sts0_valid) {
 		prop->fw_app_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg);
 
-		if (prop->fw_app_cpu_boot_dev_sts0 &
-				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
-			prop->hard_reset_done_by_fw = true;
+		prop->hard_reset_done_by_fw = !!(prop->fw_app_cpu_boot_dev_sts0 &
+							CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
 
 		if (prop->fw_app_cpu_boot_dev_sts0 &
 				CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 02/10] habanalabs: handle abort scenario for user interrupt
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  2021-11-07 20:51 ` [PATCH 03/10] habanalabs: add dedicated message towards f/w to set power Oded Gabbay
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Bharat Jauhari

From: Bharat Jauhari <bjauhari@habana.ai>

In case of device reset, the driver does a force trigger on all waiting
users to release them from waiting. However, the driver does not handle
error scenario while waiting.

hl_interrupt_wait_ioctl() now exits the wait in case of an error with
abort status.

Signed-off-by: Bharat Jauhari <bjauhari@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 34 +++++++++----------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 4c8000fd246c..41b48929cd59 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2768,7 +2768,7 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 				u32 timeout_us, u64 user_address,
 				u64 target_value, u16 interrupt_offset,
-				enum hl_cs_wait_status *status,
+				u32 *status,
 				u64 *timestamp)
 {
 	struct hl_user_pending_interrupt *pend;
@@ -2815,13 +2815,14 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	}
 
 	if (completion_value >= target_value) {
-		*status = CS_WAIT_STATUS_COMPLETED;
+		*status = HL_WAIT_CS_STATUS_COMPLETED;
 		/* There was no interrupt, we assume the completion is now. */
 		pend->fence.timestamp = ktime_get();
-	} else
-		*status = CS_WAIT_STATUS_BUSY;
+	} else {
+		*status = HL_WAIT_CS_STATUS_BUSY;
+	}
 
-	if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED))
+	if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
 		goto remove_pending_user_interrupt;
 
 wait_again:
@@ -2850,7 +2851,13 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 		}
 
 		if (completion_value >= target_value) {
-			*status = CS_WAIT_STATUS_COMPLETED;
+			*status = HL_WAIT_CS_STATUS_COMPLETED;
+		} else if (pend->fence.error) {
+			dev_err_ratelimited(hdev->dev,
+				"interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
+				pend->fence.error);
+			/* set the command completion status as ABORTED */
+			*status = HL_WAIT_CS_STATUS_ABORTED;
 		} else {
 			timeout = completion_rc;
 			goto wait_again;
@@ -2861,7 +2868,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 			interrupt->interrupt_id);
 		rc = -EINTR;
 	} else {
-		*status = CS_WAIT_STATUS_BUSY;
+		*status = HL_WAIT_CS_STATUS_BUSY;
 	}
 
 remove_pending_user_interrupt:
@@ -2883,7 +2890,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	struct hl_device *hdev = hpriv->hdev;
 	struct asic_fixed_properties *prop;
 	union hl_wait_cs_args *args = data;
-	enum hl_cs_wait_status status;
+	u32 status = HL_WAIT_CS_STATUS_BUSY;
 	u64 timestamp;
 	int rc;
 
@@ -2926,22 +2933,13 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	}
 
 	memset(args, 0, sizeof(*args));
+	args->out.status = status;
 
 	if (timestamp) {
 		args->out.timestamp_nsec = timestamp;
 		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
 	}
 
-	switch (status) {
-	case CS_WAIT_STATUS_COMPLETED:
-		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
-		break;
-	case CS_WAIT_STATUS_BUSY:
-	default:
-		args->out.status = HL_WAIT_CS_STATUS_BUSY;
-		break;
-	}
-
 	return 0;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 03/10] habanalabs: add dedicated message towards f/w to set power
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
  2021-11-07 20:51 ` [PATCH 02/10] habanalabs: handle abort scenario for user interrupt Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  2021-11-07 20:51 ` [PATCH 04/10] habanalabs: rename reset flags Oded Gabbay
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Rajaravi Krishna Katta

From: Rajaravi Krishna Katta <rkatta@habana.ai>

CPUCP_PACKET_POWER_GET packet type was used for both
hl_get_power() and hl_set_power().

To align with other sensor functions hl_set_power()
should use CPUCP_PACKET_POWER_SET.

This packet will only be used with newer ASICs, so need to add
a compatibility flag to the asic properties to indicate whether to use
this packet or the GET packet.

Signed-off-by: Rajaravi Krishna Katta <rkatta@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h       | 3 +++
 drivers/misc/habanalabs/common/hwmon.c            | 8 +++++++-
 drivers/misc/habanalabs/gaudi/gaudi.c             | 2 ++
 drivers/misc/habanalabs/goya/goya.c               | 2 ++
 drivers/misc/habanalabs/include/common/cpucp_if.h | 4 ++++
 5 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 5fc9cfd892e8..dc61f7031c38 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -546,6 +546,8 @@ struct hl_hints_range {
  * @dynamic_fw_load: is dynamic FW load is supported.
  * @gic_interrupts_enable: true if FW is not blocking GIC controller,
  *                         false otherwise.
+ * @use_get_power_for_reset_history: To support backward compatibility for Goya
+ *                                   and Gaudi
  */
 struct asic_fixed_properties {
 	struct hw_queue_properties	*hw_queues_props;
@@ -626,6 +628,7 @@ struct asic_fixed_properties {
 	u8				iatu_done_by_fw;
 	u8				dynamic_fw_load;
 	u8				gic_interrupts_enable;
+	u8				use_get_power_for_reset_history;
 };
 
 /**
diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c
index e33f65be8a00..70182b42940d 100644
--- a/drivers/misc/habanalabs/common/hwmon.c
+++ b/drivers/misc/habanalabs/common/hwmon.c
@@ -677,12 +677,18 @@ int hl_set_power(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value)
 {
 	struct cpucp_packet pkt;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET <<
+	if (prop->use_get_power_for_reset_history)
+		pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET <<
 				CPUCP_PKT_CTL_OPCODE_SHIFT);
+	else
+		pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_SET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
+
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 	pkt.value = __cpu_to_le64(value);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1dcce1bc976f..738ad2498439 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -665,6 +665,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	prop->clk_pll_index = HL_GAUDI_MME_PLL;
 	prop->max_freq_value = GAUDI_MAX_CLK_FREQ;
 
+	prop->use_get_power_for_reset_history = true;
+
 	return 0;
 }
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index ce06103292a0..959eb21dcc69 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -475,6 +475,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 
 	prop->clk_pll_index = HL_GOYA_MME_PLL;
 
+	prop->use_get_power_for_reset_history = true;
+
 	return 0;
 }
 
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index ae13231fda94..17927968e19a 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -376,6 +376,9 @@ enum pq_init_status {
  *       and QMANs. The f/w will return a bitmask where each bit represents
  *       a different engine or QMAN according to enum cpucp_idle_mask.
  *       The bit will be 1 if the engine is NOT idle.
+ *
+ * CPUCP_PACKET_POWER_SET -
+ *       Resets power history of device to 0
  */
 
 enum cpucp_packet_id {
@@ -421,6 +424,7 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_NIC_STAT_REGS_CLR,		/* internal */
 	CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,	/* internal */
 	CPUCP_PACKET_IS_IDLE_CHECK,		/* internal */
+	CPUCP_PACKET_POWER_SET,			/* internal */
 };
 
 #define CPUCP_PACKET_FENCE_VAL	0xFE8CE7A5
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 04/10] habanalabs: rename reset flags
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
  2021-11-07 20:51 ` [PATCH 02/10] habanalabs: handle abort scenario for user interrupt Oded Gabbay
  2021-11-07 20:51 ` [PATCH 03/10] habanalabs: add dedicated message towards f/w to set power Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  2021-11-07 20:51 ` [PATCH 05/10] habanalabs: change wait for interrupt timeout to 64 bit Oded Gabbay
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Bharat Jauhari

From: Bharat Jauhari <bjauhari@habana.ai>

Rename reset flags for better readability as compared to
HL_RESET_CAUSE* enum shared with the f/w.

Signed-off-by: Bharat Jauhari <bjauhari@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    |  2 +-
 drivers/misc/habanalabs/common/device.c       | 40 +++++++++----------
 drivers/misc/habanalabs/common/habanalabs.h   | 28 ++++++-------
 drivers/misc/habanalabs/common/memory.c       |  2 +-
 drivers/misc/habanalabs/common/sysfs.c        |  2 +-
 drivers/misc/habanalabs/gaudi/gaudi.c         | 14 ++++---
 drivers/misc/habanalabs/goya/goya.c           | 10 ++---
 7 files changed, 50 insertions(+), 48 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 41b48929cd59..9ebcd9894d83 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -767,7 +767,7 @@ static void cs_timedout(struct work_struct *work)
 
 	if (likely(!skip_reset_on_timeout)) {
 		if (hdev->reset_on_lockup)
-			hl_device_reset(hdev, HL_RESET_TDR);
+			hl_device_reset(hdev, HL_DRV_RESET_TDR);
 		else
 			hdev->needs_reset = true;
 	}
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 9674e2520532..eb5800b403b6 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -95,7 +95,7 @@ static void hpriv_release(struct kref *ref)
 
 	if ((hdev->reset_if_device_not_idle && !device_is_idle)
 			|| hdev->reset_upon_device_release)
-		hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE);
+		hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE);
 
 	/* Now we can mark the compute_ctx as empty. Even if a reset is running in a different
 	 * thread, we don't care because the in_reset is marked so if a user will try to open
@@ -330,10 +330,10 @@ static void device_hard_reset_pending(struct work_struct *work)
 	u32 flags;
 	int rc;
 
-	flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD;
+	flags = HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_RESET_THR;
 
 	if (device_reset_work->fw_reset)
-		flags |= HL_RESET_FW;
+		flags |= HL_DRV_RESET_BYPASS_REQ_TO_FW;
 
 	rc = hl_device_reset(hdev, flags);
 	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
@@ -541,7 +541,7 @@ static void hl_device_heartbeat(struct work_struct *work)
 		goto reschedule;
 
 	dev_err(hdev->dev, "Device heartbeat failed!\n");
-	hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_HEARTBEAT);
+	hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);
 
 	return;
 
@@ -552,7 +552,7 @@ static void hl_device_heartbeat(struct work_struct *work)
 	 * If control reached here, then at least one heartbeat work has been
 	 * scheduled since last reset/init cycle.
 	 * So if the device is not already in reset cycle, reset the flag
-	 * prev_reset_trigger as no reset occurred with HL_RESET_FW_FATAL_ERR
+	 * prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR
 	 * status for at least one heartbeat. From this point driver restarts
 	 * tracking future consecutive fatal errors.
 	 */
@@ -831,7 +831,7 @@ int hl_device_resume(struct hl_device *hdev)
 	hdev->disabled = false;
 	atomic_set(&hdev->in_reset, 0);
 
-	rc = hl_device_reset(hdev, HL_RESET_HARD);
+	rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to reset device during resume\n");
 		goto disable_device;
@@ -948,15 +948,15 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 	 * ('in_reset' makes sure of it). This makes sure that
 	 * 'reset_cause' will continue holding its 1st recorded reason!
 	 */
-	if (flags & HL_RESET_HEARTBEAT) {
+	if (flags & HL_DRV_RESET_HEARTBEAT) {
 		hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
-		cur_reset_trigger = HL_RESET_HEARTBEAT;
-	} else if (flags & HL_RESET_TDR) {
+		cur_reset_trigger = HL_DRV_RESET_HEARTBEAT;
+	} else if (flags & HL_DRV_RESET_TDR) {
 		hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
-		cur_reset_trigger = HL_RESET_TDR;
-	} else if (flags & HL_RESET_FW_FATAL_ERR) {
+		cur_reset_trigger = HL_DRV_RESET_TDR;
+	} else if (flags & HL_DRV_RESET_FW_FATAL_ERR) {
 		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
-		cur_reset_trigger = HL_RESET_FW_FATAL_ERR;
+		cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR;
 	} else {
 		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
 	}
@@ -979,8 +979,8 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 	 * If F/W is performing the reset, no need to send it a message to disable
 	 * PCI access
 	 */
-	if ((flags & HL_RESET_HARD) &&
-			!(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
+	if ((flags & HL_DRV_RESET_HARD) &&
+			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
 		/* Disable PCI access from device F/W so he won't send
 		 * us additional interrupts. We disable MSI/MSI-X at
 		 * the halt_engines function and we can't have the F/W
@@ -1025,9 +1025,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		return 0;
 	}
 
-	hard_reset = !!(flags & HL_RESET_HARD);
-	from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
-	fw_reset = !!(flags & HL_RESET_FW);
+	hard_reset = !!(flags & HL_DRV_RESET_HARD);
+	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
+	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
 
 	if (!hard_reset && !hdev->supports_soft_reset) {
 		hard_instead_soft = true;
@@ -1035,7 +1035,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	}
 
 	if (hdev->reset_upon_device_release &&
-			(flags & HL_RESET_DEVICE_RELEASE)) {
+			(flags & HL_DRV_RESET_DEV_RELEASE)) {
 		dev_dbg(hdev->dev,
 			"Perform %s-reset upon device release\n",
 			hard_reset ? "hard" : "soft");
@@ -1075,7 +1075,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 		if (hard_reset)
 			dev_info(hdev->dev, "Going to reset device\n");
-		else if (flags & HL_RESET_DEVICE_RELEASE)
+		else if (flags & HL_DRV_RESET_DEV_RELEASE)
 			dev_info(hdev->dev,
 				"Going to reset device after it was released by user\n");
 		else
@@ -1171,7 +1171,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		hdev->hard_reset_pending = false;
 
 		if (hdev->reset_trigger_repeated &&
-				(hdev->prev_reset_trigger == HL_RESET_FW_FATAL_ERR)) {
+				(hdev->prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR)) {
 			/* if there 2 back to back resets from FW,
 			 * ensure driver puts the driver in a unusable state
 			 */
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index dc61f7031c38..92d12c8ba569 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -120,37 +120,37 @@ enum hl_mmu_page_table_location {
 /*
  * Reset Flags
  *
- * - HL_RESET_HARD
+ * - HL_DRV_RESET_HARD
  *       If set do hard reset to all engines. If not set reset just
  *       compute/DMA engines.
  *
- * - HL_RESET_FROM_RESET_THREAD
+ * - HL_DRV_RESET_FROM_RESET_THR
  *       Set if the caller is the hard-reset thread
  *
- * - HL_RESET_HEARTBEAT
+ * - HL_DRV_RESET_HEARTBEAT
  *       Set if reset is due to heartbeat
  *
- * - HL_RESET_TDR
+ * - HL_DRV_RESET_TDR
  *       Set if reset is due to TDR
  *
- * - HL_RESET_DEVICE_RELEASE
+ * - HL_DRV_RESET_DEV_RELEASE
  *       Set if reset is due to device release
  *
- * - HL_RESET_FW
+ * - HL_DRV_RESET_BYPASS_REQ_TO_FW
  *       F/W will perform the reset. No need to ask it to reset the device. This is relevant
  *       only when running with secured f/w
  *
- * - HL_RESET_FW_FATAL_ERR
+ * - HL_DRV_RESET_FW_FATAL_ERR
  *       Set if reset is due to a fatal error from FW
  */
 
-#define HL_RESET_HARD			(1 << 0)
-#define HL_RESET_FROM_RESET_THREAD	(1 << 1)
-#define HL_RESET_HEARTBEAT		(1 << 2)
-#define HL_RESET_TDR			(1 << 3)
-#define HL_RESET_DEVICE_RELEASE		(1 << 4)
-#define HL_RESET_FW			(1 << 5)
-#define HL_RESET_FW_FATAL_ERR		(1 << 6)
+#define HL_DRV_RESET_HARD		(1 << 0)
+#define HL_DRV_RESET_FROM_RESET_THR	(1 << 1)
+#define HL_DRV_RESET_HEARTBEAT		(1 << 2)
+#define HL_DRV_RESET_TDR		(1 << 3)
+#define HL_DRV_RESET_DEV_RELEASE	(1 << 4)
+#define HL_DRV_RESET_BYPASS_REQ_TO_FW	(1 << 5)
+#define HL_DRV_RESET_FW_FATAL_ERR	(1 << 6)
 
 #define HL_MAX_SOBS_PER_MONITOR	8
 
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index cd3640617d02..530f8b4fadd2 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -316,7 +316,7 @@ static int free_phys_pg_pack(struct hl_device *hdev,
 	}
 
 	if (rc && !hdev->disabled)
-		hl_device_reset(hdev, HL_RESET_HARD);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD);
 
 end:
 	kvfree(phys_pg_pack->pages);
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index 42c1769ad25d..aee0cc4d6155 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -236,7 +236,7 @@ static ssize_t hard_reset_store(struct device *dev,
 
 	dev_warn(hdev->dev, "Hard-Reset requested through sysfs\n");
 
-	hl_device_reset(hdev, HL_RESET_HARD);
+	hl_device_reset(hdev, HL_DRV_RESET_HARD);
 
 out:
 	return count;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 738ad2498439..2724ab3747f2 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8003,7 +8003,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
 		gaudi_print_irq_info(hdev, event_type, true);
 		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
-		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_GIC500:
@@ -8011,7 +8011,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_L2_RAM_ECC:
 	case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
 		gaudi_print_irq_info(hdev, event_type, false);
-		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_HBM0_SPI_0:
@@ -8022,7 +8022,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		gaudi_hbm_read_interrupts(hdev,
 				gaudi_hbm_event_to_dev(event_type),
 				&eq_entry->hbm_ecc_data);
-		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_HBM0_SPI_1:
@@ -8205,9 +8205,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 
 reset_device:
 	if (hdev->asic_prop.fw_security_enabled)
-		hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW | fw_fatal_err_flag);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD
+					| HL_DRV_RESET_BYPASS_REQ_TO_FW
+					| fw_fatal_err_flag);
 	else if (hdev->hard_reset_on_fw_events)
-		hl_device_reset(hdev, HL_RESET_HARD | fw_fatal_err_flag);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag);
 	else
 		hl_fw_unmask_irq(hdev, event_type);
 }
@@ -8260,7 +8262,7 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 	if (rc) {
 		dev_err_ratelimited(hdev->dev,
 					"MMU cache invalidation timeout\n");
-		hl_device_reset(hdev, HL_RESET_HARD);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD);
 	}
 
 	return rc;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 959eb21dcc69..3bbcab7da25e 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4838,14 +4838,14 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 	case GOYA_ASYNC_EVENT_ID_L2_RAM_ECC:
 		goya_print_irq_info(hdev, event_type, false);
 		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, (HL_RESET_HARD |
-						HL_RESET_FW_FATAL_ERR));
+			hl_device_reset(hdev, (HL_DRV_RESET_HARD |
+						HL_DRV_RESET_FW_FATAL_ERR));
 		break;
 
 	case GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET:
 		goya_print_irq_info(hdev, event_type, false);
 		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, HL_RESET_HARD);
+			hl_device_reset(hdev, HL_DRV_RESET_HARD);
 		break;
 
 	case GOYA_ASYNC_EVENT_ID_PCIE_DEC:
@@ -4905,7 +4905,7 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 		goya_print_irq_info(hdev, event_type, false);
 		goya_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
 		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, HL_RESET_HARD);
+			hl_device_reset(hdev, HL_DRV_RESET_HARD);
 		else
 			hl_fw_unmask_irq(hdev, event_type);
 		break;
@@ -5239,7 +5239,7 @@ static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 	if (rc) {
 		dev_err_ratelimited(hdev->dev,
 					"MMU cache invalidation timeout\n");
-		hl_device_reset(hdev, HL_RESET_HARD);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD);
 	}
 
 	return rc;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 05/10] habanalabs: change wait for interrupt timeout to 64 bit
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
                   ` (2 preceding siblings ...)
  2021-11-07 20:51 ` [PATCH 04/10] habanalabs: rename reset flags Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  2021-11-07 20:51 ` [PATCH 06/10] habanalabs: expand clock throttling information uAPI Oded Gabbay
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

In order to increase maximum wait-for-interrupt timeout, change it
to 64 bit variable. This wait is used only by newer ASICs, so no
problem in changing this interface at this time.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 22 ++++++++++++++-----
 include/uapi/misc/habanalabs.h                | 18 +++++++++------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 9ebcd9894d83..54a5425a77a0 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2765,8 +2765,23 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	return 0;
 }
 
+static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
+{
+	if (usecs <= U32_MAX)
+		return usecs_to_jiffies(usecs);
+
+	/*
+	 * If the value in nanoseconds is larger than 64 bit, use the largest
+	 * 64 bit value.
+	 */
+	if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC)))
+		return nsecs_to_jiffies(U64_MAX);
+
+	return nsecs_to_jiffies(usecs * NSEC_PER_USEC);
+}
+
 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
-				u32 timeout_us, u64 user_address,
+				u64 timeout_us, u64 user_address,
 				u64 target_value, u16 interrupt_offset,
 				u32 *status,
 				u64 *timestamp)
@@ -2778,10 +2793,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	long completion_rc;
 	int rc = 0;
 
-	if (timeout_us == U32_MAX)
-		timeout = timeout_us;
-	else
-		timeout = usecs_to_jiffies(timeout_us);
+	timeout = hl_usecs64_to_jiffies(timeout_us);
 
 	hl_ctx_get(hdev, ctx);
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 00b309590499..c5760acebdd1 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -911,14 +911,18 @@ struct hl_wait_cs_in {
 	 */
 	__u32 flags;
 
-	/* Multi CS API info- valid entries in multi-CS array */
-	__u8 seq_arr_len;
-	__u8 pad[3];
+	union {
+		struct {
+			/* Multi CS API info- valid entries in multi-CS array */
+			__u8 seq_arr_len;
+			__u8 pad[7];
+		};
 
-	/* Absolute timeout to wait for an interrupt in microseconds.
-	 * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
-	 */
-	__u32 interrupt_timeout_us;
+		/* Absolute timeout to wait for an interrupt in microseconds.
+		 * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
+		 */
+		__u64 interrupt_timeout_us;
+	};
 };
 
 #define HL_WAIT_CS_STATUS_COMPLETED	0
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 06/10] habanalabs: expand clock throttling information uAPI
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
                   ` (3 preceding siblings ...)
  2021-11-07 20:51 ` [PATCH 05/10] habanalabs: change wait for interrupt timeout to 64 bit Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  2021-11-07 20:51 ` [PATCH 07/10] habanalabs/gaudi: Fix collective wait bug Oded Gabbay
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

In addition to the clock throttling reason, user should be able
to obtain also the start time and the duration of the throttling
event.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c       |  3 ++
 drivers/misc/habanalabs/common/habanalabs.h   | 31 +++++++++++++++++--
 .../misc/habanalabs/common/habanalabs_ioctl.c | 27 ++++++++++++++--
 drivers/misc/habanalabs/gaudi/gaudi.c         | 22 ++++++++++---
 drivers/misc/habanalabs/goya/goya.c           | 25 ++++++++++++---
 include/uapi/misc/habanalabs.h                | 16 ++++++++--
 6 files changed, 110 insertions(+), 14 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index eb5800b403b6..0da5a55490ff 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -455,6 +455,7 @@ static int device_early_init(struct hl_device *hdev)
 	INIT_LIST_HEAD(&hdev->fpriv_list);
 	mutex_init(&hdev->fpriv_list_lock);
 	atomic_set(&hdev->in_reset, 0);
+	mutex_init(&hdev->clk_throttling.lock);
 
 	return 0;
 
@@ -495,6 +496,8 @@ static void device_early_fini(struct hl_device *hdev)
 
 	mutex_destroy(&hdev->fpriv_list_lock);
 
+	mutex_destroy(&hdev->clk_throttling.lock);
+
 	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
 
 	kfree(hdev->hl_chip_info);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 92d12c8ba569..fc201537f7a9 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2378,6 +2378,32 @@ struct multi_cs_data {
 	u8		update_ts;
 };
 
+/**
+ * struct hl_clk_throttle_timestamp - current/last clock throttling timestamp
+ * @start: timestamp taken when 'start' event is received in driver
+ * @end: timestamp taken when 'end' event is received in driver
+ */
+struct hl_clk_throttle_timestamp {
+	ktime_t		start;
+	ktime_t		end;
+};
+
+/**
+ * struct hl_clk_throttle - keeps current/last clock throttling timestamps
+ * @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER
+ *             index 1 refers to THERMAL
+ * @lock: protects this structure as it can be accessed from both event queue
+ *        context and info_ioctl context
+ * @current_reason: bitmask represents the current clk throttling reasons
+ * @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load
+ */
+struct hl_clk_throttle {
+	struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX];
+	struct mutex	lock;
+	u32		current_reason;
+	u32		aggregated_reason;
+};
+
 /**
  * struct hl_device - habanalabs device structure.
  * @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -2445,6 +2471,7 @@ struct multi_cs_data {
  * @pci_mem_region: array of memory regions in the PCI
  * @state_dump_specs: constants and dictionaries needed to dump system state.
  * @multi_cs_completion: array of multi-CS completion.
+ * @clk_throttling: holds information about current/previous clock throttling events
  * @dram_used_mem: current DRAM memory consumption.
  * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2474,7 +2501,6 @@ struct multi_cs_data {
  * @high_pll: high PLL profile frequency.
  * @soft_reset_cnt: number of soft reset since the driver was loaded.
  * @hard_reset_cnt: number of hard reset since the driver was loaded.
- * @clk_throttling_reason: bitmask represents the current clk throttling reasons
  * @id: device minor.
  * @id_control: minor of the control device
  * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -2604,6 +2630,8 @@ struct hl_device {
 
 	struct multi_cs_completion	multi_cs_completion[
 							MULTI_CS_MAX_USER_CTX];
+	struct hl_clk_throttle		clk_throttling;
+
 	u32				*stream_master_qid_arr;
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
@@ -2622,7 +2650,6 @@ struct hl_device {
 	u32				high_pll;
 	u32				soft_reset_cnt;
 	u32				hard_reset_cnt;
-	u32				clk_throttling_reason;
 	u16				id;
 	u16				id_control;
 	u16				cpu_pci_msb_addr;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 86c3257d9ae1..19726c6b642a 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -313,15 +313,38 @@ static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 
 static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_info_clk_throttle clk_throttle = {0};
+	ktime_t end_time, zero_time = ktime_set(0, 0);
 	u32 max_size = args->return_size;
-	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int i;
 
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
+	mutex_lock(&hdev->clk_throttling.lock);
+
+	clk_throttle.clk_throttling_reason = hdev->clk_throttling.current_reason;
+
+	for (i = 0 ; i < HL_CLK_THROTTLE_TYPE_MAX ; i++) {
+		if (!(hdev->clk_throttling.aggregated_reason & BIT(i)))
+			continue;
+
+		clk_throttle.clk_throttling_timestamp_us[i] =
+			ktime_to_us(hdev->clk_throttling.timestamp[i].start);
+
+		if (ktime_compare(hdev->clk_throttling.timestamp[i].end, zero_time))
+			end_time = ktime_get();
+		else
+			end_time = hdev->clk_throttling.timestamp[i].end;
+
+		clk_throttle.clk_throttling_duration_ns[i] =
+			ktime_to_ns(ktime_sub(end_time,
+				hdev->clk_throttling.timestamp[i].start));
+
+	}
+	mutex_unlock(&hdev->clk_throttling.lock);
 
 	return copy_to_user(out, &clk_throttle,
 		min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 2724ab3747f2..b4814369062e 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7925,27 +7925,39 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type)
 static void gaudi_print_clk_change_info(struct hl_device *hdev,
 					u16 event_type)
 {
+	ktime_t zero_time = ktime_set(0, 0);
+
+	mutex_lock(&hdev->clk_throttling.lock);
+
 	switch (event_type) {
 	case GAUDI_EVENT_FIX_POWER_ENV_S:
-		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
 		dev_info_ratelimited(hdev->dev,
 			"Clock throttling due to power consumption\n");
 		break;
 
 	case GAUDI_EVENT_FIX_POWER_ENV_E:
-		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
 		dev_info_ratelimited(hdev->dev,
 			"Power envelop is safe, back to optimal clock\n");
 		break;
 
 	case GAUDI_EVENT_FIX_THERMAL_ENV_S:
-		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
 		dev_info_ratelimited(hdev->dev,
 			"Clock throttling due to overheating\n");
 		break;
 
 	case GAUDI_EVENT_FIX_THERMAL_ENV_E:
-		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
 		dev_info_ratelimited(hdev->dev,
 			"Thermal envelop is safe, back to optimal clock\n");
 		break;
@@ -7955,6 +7967,8 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
 			event_type);
 		break;
 	}
+
+	mutex_unlock(&hdev->clk_throttling.lock);
 }
 
 static void gaudi_handle_eqe(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 3bbcab7da25e..7b3683f2a6dc 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4768,24 +4768,39 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
 
 static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
 {
+	ktime_t zero_time = ktime_set(0, 0);
+
+	mutex_lock(&hdev->clk_throttling.lock);
+
 	switch (event_type) {
 	case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
-		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
 		dev_info_ratelimited(hdev->dev,
 			"Clock throttling due to power consumption\n");
 		break;
+
 	case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
-		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
 		dev_info_ratelimited(hdev->dev,
 			"Power envelop is safe, back to optimal clock\n");
 		break;
+
 	case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
-		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
 		dev_info_ratelimited(hdev->dev,
 			"Clock throttling due to overheating\n");
 		break;
+
 	case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
-		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
 		dev_info_ratelimited(hdev->dev,
 			"Thermal envelop is safe, back to optimal clock\n");
 		break;
@@ -4795,6 +4810,8 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
 			event_type);
 		break;
 	}
+
+	mutex_unlock(&hdev->clk_throttling.lock);
 }
 
 void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index c5760acebdd1..257b9630773e 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -473,15 +473,27 @@ struct hl_info_pci_counters {
 	__u64 replay_cnt;
 };
 
-#define HL_CLK_THROTTLE_POWER	0x1
-#define HL_CLK_THROTTLE_THERMAL	0x2
+enum hl_clk_throttling_type {
+	HL_CLK_THROTTLE_TYPE_POWER,
+	HL_CLK_THROTTLE_TYPE_THERMAL,
+	HL_CLK_THROTTLE_TYPE_MAX
+};
+
+/* clk_throttling_reason masks */
+#define HL_CLK_THROTTLE_POWER		(1 << HL_CLK_THROTTLE_TYPE_POWER)
+#define HL_CLK_THROTTLE_THERMAL		(1 << HL_CLK_THROTTLE_TYPE_THERMAL)
 
 /**
  * struct hl_info_clk_throttle - clock throttling reason
  * @clk_throttling_reason: each bit represents a clk throttling reason
+ * @clk_throttling_timestamp_us: represents CPU timestamp in microseconds of the start-event
+ * @clk_throttling_duration_ns: the clock throttle time in nanosec
  */
 struct hl_info_clk_throttle {
 	__u32 clk_throttling_reason;
+	__u32 pad;
+	__u64 clk_throttling_timestamp_us[HL_CLK_THROTTLE_TYPE_MAX];
+	__u64 clk_throttling_duration_ns[HL_CLK_THROTTLE_TYPE_MAX];
 };
 
 /**
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 07/10] habanalabs/gaudi: Fix collective wait bug
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
                   ` (4 preceding siblings ...)
  2021-11-07 20:51 ` [PATCH 06/10] habanalabs: expand clock throttling information uAPI Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  2021-11-07 20:51 ` [PATCH 08/10] habanalabs: refactor wait-for-user-interrupt function Oded Gabbay
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: farah kassabri

From: farah kassabri <fkassabri@habana.ai>

In Signaling-From-Graph case, the driver didn't set the hw_sob pointer
at the right place, which is needed for the cs completion
check prior to start sending all the master/slaves jobs to device.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index b4814369062e..a9e279bfebae 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1276,6 +1276,7 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 		container_of(cs->signal_fence, struct hl_cs_compl, base_fence);
 	struct hl_cs_compl *cs_cmpl =
 		container_of(cs->fence, struct hl_cs_compl, base_fence);
+	struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
 	struct gaudi_collective_properties *cprop;
 	u32 stream, queue_id, sob_group_offset;
 	struct gaudi_device *gaudi;
@@ -1288,10 +1289,16 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 	gaudi = hdev->asic_specific;
 	cprop = &gaudi->collective_props;
 
-	/* In encaps signals case the SOB info will be retrieved from
-	 * the handle in gaudi_collective_slave_init_job.
-	 */
-	if (!cs->encaps_signals) {
+	if (cs->encaps_signals) {
+		cs_cmpl->hw_sob = handle->hw_sob;
+		/* at this checkpoint we only need the hw_sob pointer
+		 * for the completion check before start going over the jobs
+		 * of the master/slaves, the sob_value will be taken later on
+		 * in gaudi_collective_slave_init_job depends on each
+		 * job wait offset value.
+		 */
+		cs_cmpl->sob_val = 0;
+	} else {
 		/* copy the SOB id and value of the signal CS */
 		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
 		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 08/10] habanalabs: refactor wait-for-user-interrupt function
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
                   ` (5 preceding siblings ...)
  2021-11-07 20:51 ` [PATCH 07/10] habanalabs/gaudi: Fix collective wait bug Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  2021-11-07 20:51 ` [PATCH 09/10] habanalabs: add new opcodes for INFO IOCTL Oded Gabbay
  2021-11-07 20:51 ` [PATCH 10/10] habanalabs: make hdev creation code more readable Oded Gabbay
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Bharat Jauhari

From: Bharat Jauhari <bjauhari@habana.ai>

Refactor the wait-for-user-interrupt routine to make it more
generic for re-use for other user exposed h/w interfaces in future
ASICs.

Signed-off-by: Bharat Jauhari <bjauhari@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 22 +++++++------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 54a5425a77a0..e97b21988dea 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2782,12 +2782,12 @@ static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
 
 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 				u64 timeout_us, u64 user_address,
-				u64 target_value, u16 interrupt_offset,
+				u64 target_value, struct hl_user_interrupt *interrupt,
+
 				u32 *status,
 				u64 *timestamp)
 {
 	struct hl_user_pending_interrupt *pend;
-	struct hl_user_interrupt *interrupt;
 	unsigned long timeout, flags;
 	u64 completion_value;
 	long completion_rc;
@@ -2805,11 +2805,6 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 
 	hl_fence_init(&pend->fence, ULONG_MAX);
 
-	if (interrupt_offset == HL_COMMON_USER_INTERRUPT_ID)
-		interrupt = &hdev->common_user_interrupt;
-	else
-		interrupt = &hdev->user_interrupt[interrupt_offset];
-
 	/* Add pending user interrupt to relevant list for the interrupt
 	 * handler to monitor
 	 */
@@ -2898,9 +2893,10 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 
 static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 {
-	u16 interrupt_id, interrupt_offset, first_interrupt, last_interrupt;
+	u16 interrupt_id, first_interrupt, last_interrupt;
 	struct hl_device *hdev = hpriv->hdev;
 	struct asic_fixed_properties *prop;
+	struct hl_user_interrupt *interrupt;
 	union hl_wait_cs_args *args = data;
 	u32 status = HL_WAIT_CS_STATUS_BUSY;
 	u64 timestamp;
@@ -2913,8 +2909,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 		return -EPERM;
 	}
 
-	interrupt_id =
-		FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);
+	interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);
 
 	first_interrupt = prop->first_available_user_msix_interrupt;
 	last_interrupt = prop->first_available_user_msix_interrupt +
@@ -2927,15 +2922,14 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	}
 
 	if (interrupt_id == HL_COMMON_USER_INTERRUPT_ID)
-		interrupt_offset = HL_COMMON_USER_INTERRUPT_ID;
+		interrupt = &hdev->common_user_interrupt;
 	else
-		interrupt_offset = interrupt_id - first_interrupt;
+		interrupt = &hdev->user_interrupt[interrupt_id - first_interrupt];
 
 	rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx,
 				args->in.interrupt_timeout_us, args->in.addr,
-				args->in.target, interrupt_offset, &status,
+				args->in.target, interrupt, &status,
 				&timestamp);
-
 	if (rc) {
 		if (rc != -EINTR)
 			dev_err_ratelimited(hdev->dev,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 09/10] habanalabs: add new opcodes for INFO IOCTL
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
                   ` (6 preceding siblings ...)
  2021-11-07 20:51 ` [PATCH 08/10] habanalabs: refactor wait-for-user-interrupt function Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  2021-11-07 20:51 ` [PATCH 10/10] habanalabs: make hdev creation code more readable Oded Gabbay
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: farah kassabri

From: farah kassabri <fkassabri@habana.ai>

Add implementation for new opcodes in the INFO IOCTL:
1. Retrieve the replaced DRAM rows from f/w.
2. Retrieve the pending DRAM rows from f/w.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c  | 66 +++++++++++++++++++
 drivers/misc/habanalabs/common/habanalabs.h   |  3 +
 .../misc/habanalabs/common/habanalabs_ioctl.c | 43 ++++++++++++
 .../misc/habanalabs/include/common/cpucp_if.h | 33 +++++++++-
 include/uapi/misc/habanalabs.h                |  4 ++
 5 files changed, 148 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 9addcfba6a8b..70e992bdbde7 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -972,6 +972,72 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
 	return rc;
 }
 
+int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
+				struct cpucp_hbm_row_info *info)
+{
+	struct cpucp_hbm_row_info *cpucp_repl_rows_info_cpu_addr;
+	dma_addr_t cpucp_repl_rows_info_dma_addr;
+	struct cpucp_packet pkt = {};
+	u64 result;
+	int rc;
+
+	cpucp_repl_rows_info_cpu_addr =
+			hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
+					sizeof(struct cpucp_hbm_row_info),
+					&cpucp_repl_rows_info_dma_addr);
+	if (!cpucp_repl_rows_info_cpu_addr) {
+		dev_err(hdev->dev,
+			"Failed to allocate DMA memory for CPU-CP replaced rows info packet\n");
+		return -ENOMEM;
+	}
+
+	memset(cpucp_repl_rows_info_cpu_addr, 0, sizeof(struct cpucp_hbm_row_info));
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET <<
+					CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.addr = cpu_to_le64(cpucp_repl_rows_info_dma_addr);
+	pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_hbm_row_info));
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
+		goto out;
+	}
+
+	memcpy(info, cpucp_repl_rows_info_cpu_addr, sizeof(*info));
+
+out:
+	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
+					sizeof(struct cpucp_hbm_row_info),
+					cpucp_repl_rows_info_cpu_addr);
+
+	return rc;
+}
+
+int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num)
+{
+	struct cpucp_packet pkt;
+	u64 result;
+	int rc;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_PENDING_ROWS_STATUS << CPUCP_PKT_CTL_OPCODE_SHIFT);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
+	if (rc) {
+		dev_err(hdev->dev,
+				"Failed to handle CPU-CP pending rows info pkt, error %d\n", rc);
+		goto out;
+	}
+
+	*pend_rows_num = (u32) result;
+out:
+	return rc;
+}
+
 void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
 {
 	struct static_fw_load_mgr *static_loader =
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index fc201537f7a9..a19563c416ac 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -3012,6 +3012,9 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
 				struct fw_load_mgr *fw_loader,
 				enum comms_cmd cmd, unsigned int size,
 				bool wait_ok, u32 timeout);
+int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
+				struct cpucp_hbm_row_info *info);
+int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num);
 int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
 			bool is_wc[3]);
 int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data);
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 19726c6b642a..68c655acdec8 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -503,6 +503,43 @@ static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 		min((size_t) max_size, sizeof(open_stats_info))) ? -EFAULT : 0;
 }
 
+static int dram_pending_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	u32 pend_rows_num = 0;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	rc = hl_fw_dram_pending_row_get(hdev, &pend_rows_num);
+	if (rc)
+		return rc;
+
+	return copy_to_user(out, &pend_rows_num,
+			min_t(size_t, max_size, sizeof(pend_rows_num))) ? -EFAULT : 0;
+}
+
+static int dram_replaced_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	struct cpucp_hbm_row_info info = {0};
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	rc = hl_fw_dram_replaced_row_get(hdev, &info);
+	if (rc)
+		return rc;
+
+	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -589,6 +626,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_OPEN_STATS:
 		return open_stats_info(hpriv, args);
 
+	case HL_INFO_DRAM_REPLACED_ROWS:
+		return dram_replaced_rows_info(hpriv, args);
+
+	case HL_INFO_DRAM_PENDING_ROWS:
+		return dram_pending_rows_info(hpriv, args);
+
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -ENOTTY;
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 17927968e19a..5e19c763f3f0 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2020 HabanaLabs, Ltd.
+ * Copyright 2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -377,6 +377,13 @@ enum pq_init_status {
  *       a different engine or QMAN according to enum cpucp_idle_mask.
  *       The bit will be 1 if the engine is NOT idle.
  *
+ * CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET -
+ *       Fetch all HBM replaced-rows and prending to be replaced rows data.
+ *
+ * CPUCP_PACKET_HBM_PENDING_ROWS_STATUS -
+ *       Fetch status of HBM rows pending replacement and need a reboot to
+ *       be replaced.
+ *
  * CPUCP_PACKET_POWER_SET -
  *       Resets power history of device to 0
  */
@@ -424,6 +431,8 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_NIC_STAT_REGS_CLR,		/* internal */
 	CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,	/* internal */
 	CPUCP_PACKET_IS_IDLE_CHECK,		/* internal */
+	CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */
+	CPUCP_PACKET_HBM_PENDING_ROWS_STATUS,	/* internal */
 	CPUCP_PACKET_POWER_SET,			/* internal */
 };
 
@@ -692,6 +701,7 @@ struct eq_generic_event {
 #define CPUCP_MAX_NIC_LANES		(CPUCP_MAX_NICS * CPUCP_LANES_PER_NIC)
 #define CPUCP_NIC_MASK_ARR_LEN		((CPUCP_MAX_NICS + 63) / 64)
 #define CPUCP_NIC_POLARITY_ARR_LEN	((CPUCP_MAX_NIC_LANES + 63) / 64)
+#define CPUCP_HBM_ROW_REPLACE_MAX	32
 
 struct cpucp_sensor {
 	__le32 type;
@@ -837,4 +847,25 @@ struct cpucp_nic_status {
 	__le32 high_ber_cnt;
 };
 
+enum cpucp_hbm_row_replace_cause {
+	REPLACE_CAUSE_DOUBLE_ECC_ERR,
+	REPLACE_CAUSE_MULTI_SINGLE_ECC_ERR,
+};
+
+struct cpucp_hbm_row_info {
+	__u8 hbm_idx;
+	__u8 pc;
+	__u8 sid;
+	__u8 bank_idx;
+	__le16 row_addr;
+	__u8 replaced_row_cause; /* enum cpucp_hbm_row_replace_cause */
+	__u8 pad;
+};
+
+struct cpucp_hbm_row_replaced_rows_info {
+	__le16 num_replaced_rows;
+	__u8 pad[6];
+	struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX];
+};
+
 #endif /* CPUCP_IF_H */
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 257b9630773e..9b4d72897061 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -334,6 +334,8 @@ enum hl_server_type {
  * HL_INFO_TOTAL_ENERGY  - Retrieve total energy consumption
  * HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency
  * HL_INFO_OPEN_STATS    - Retrieve info regarding recent device open calls
+ * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info
+ * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num
  */
 #define HL_INFO_HW_IP_INFO		0
 #define HL_INFO_HW_EVENTS		1
@@ -353,6 +355,8 @@ enum hl_server_type {
 #define HL_INFO_PLL_FREQUENCY		16
 #define HL_INFO_POWER			17
 #define HL_INFO_OPEN_STATS		18
+#define HL_INFO_DRAM_REPLACED_ROWS	21
+#define HL_INFO_DRAM_PENDING_ROWS	22
 
 #define HL_INFO_VERSION_MAX_LEN	128
 #define HL_INFO_CARD_NAME_MAX_LEN	16
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 10/10] habanalabs: make hdev creation code more readable
  2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
                   ` (7 preceding siblings ...)
  2021-11-07 20:51 ` [PATCH 09/10] habanalabs: add new opcodes for INFO IOCTL Oded Gabbay
@ 2021-11-07 20:51 ` Oded Gabbay
  8 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2021-11-07 20:51 UTC (permalink / raw)
  To: linux-kernel

Divide the code into 3 different parts:
- Copy kernel parameters
- Setting device behaivor per asic
- Fixup of various device parameters according to the device behaivor.

In addition, remove non-relevant code for upstream (simulator support).

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h   |   6 +-
 .../misc/habanalabs/common/habanalabs_drv.c   | 123 +++++++++---------
 2 files changed, 61 insertions(+), 68 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index a19563c416ac..6b33fbd72fd8 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -62,7 +62,6 @@
 #define HL_CPUCP_EEPROM_TIMEOUT_USEC	10000000 /* 10s */
 
 #define HL_FW_STATUS_POLL_INTERVAL_USEC		10000 /* 10ms */
-#define HL_FW_STATUS_PLDM_POLL_INTERVAL_USEC	300000000 /* 300s */
 
 #define HL_PCI_ELBI_TIMEOUT_MSEC	10 /* 10ms */
 
@@ -2823,9 +2822,6 @@ bool hl_device_operational(struct hl_device *hdev,
 		enum hl_device_status *status);
 enum hl_device_status hl_device_status(struct hl_device *hdev);
 int hl_device_set_debug_mode(struct hl_device *hdev, bool enable);
-int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
-		enum hl_asic_type asic_type, int minor);
-void destroy_hdev(struct hl_device *hdev);
 int hl_hw_queues_create(struct hl_device *hdev);
 void hl_hw_queues_destroy(struct hl_device *hdev);
 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 5989826701bc..85034f2f2e89 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -263,6 +263,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
 
 static void set_driver_behavior_per_device(struct hl_device *hdev)
 {
+	hdev->pldm = 0;
 	hdev->fw_components = FW_TYPE_ALL_TYPES;
 	hdev->cpu_queues_enable = 1;
 	hdev->heartbeat = 1;
@@ -279,23 +280,53 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
 	hdev->axi_drain = 0;
 }
 
-/*
+static void copy_kernel_module_params_to_device(struct hl_device *hdev)
+{
+	hdev->major = hl_major;
+	hdev->memory_scrub = memory_scrub;
+	hdev->reset_on_lockup = reset_on_lockup;
+	hdev->boot_error_status_mask = boot_error_status_mask;
+
+	if (timeout_locked)
+		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
+	else
+		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+
+}
+
+static int fixup_device_params(struct hl_device *hdev)
+{
+	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
+
+	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
+
+	hdev->stop_on_err = true;
+	hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+	hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+
+	/* Enable only after the initialization of the device */
+	hdev->disabled = true;
+
+	/* Set default DMA mask to 32 bits */
+	hdev->dma_mask = 32;
+
+	return 0;
+}
+
+/**
  * create_hdev - create habanalabs device instance
  *
  * @dev: will hold the pointer to the new habanalabs device structure
  * @pdev: pointer to the pci device
- * @asic_type: in case of simulator device, which device is it
- * @minor: in case of simulator device, the minor of the device
  *
  * Allocate memory for habanalabs device and initialize basic fields
  * Identify the ASIC type
  * Allocate ID (minor) for the device (only for real devices)
  */
-int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
-		enum hl_asic_type asic_type, int minor)
+static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
 {
+	int main_id, ctrl_id = 0, rc = 0;
 	struct hl_device *hdev;
-	int rc, main_id, ctrl_id = 0;
 
 	*dev = NULL;
 
@@ -303,72 +334,39 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	if (!hdev)
 		return -ENOMEM;
 
-	/* First, we must find out which ASIC are we handling. This is needed
-	 * to configure the behavior of the driver (kernel parameters)
-	 */
-	if (pdev) {
-		hdev->asic_type = get_asic_type(pdev->device);
-		if (hdev->asic_type == ASIC_INVALID) {
-			dev_err(&pdev->dev, "Unsupported ASIC\n");
-			rc = -ENODEV;
-			goto free_hdev;
-		}
-	} else {
-		hdev->asic_type = asic_type;
-	}
-
-	if (pdev)
-		hdev->asic_prop.fw_security_enabled =
-					is_asic_secured(hdev->asic_type);
-	else
-		hdev->asic_prop.fw_security_enabled = false;
+	/* can be NULL in case of simulator device */
+	hdev->pdev = pdev;
 
 	/* Assign status description string */
-	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL],
-					"operational", HL_STR_MAX);
-	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
-					"in reset", HL_STR_MAX);
-	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
-					"disabled", HL_STR_MAX);
-	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
-					"needs reset", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
 					"in device creation", HL_STR_MAX);
 
-	hdev->major = hl_major;
-	hdev->reset_on_lockup = reset_on_lockup;
-	hdev->memory_scrub = memory_scrub;
-	hdev->boot_error_status_mask = boot_error_status_mask;
-	hdev->stop_on_err = true;
+	/* First, we must find out which ASIC are we handling. This is needed
+	 * to configure the behavior of the driver (kernel parameters)
+	 */
+	hdev->asic_type = get_asic_type(pdev->device);
+	if (hdev->asic_type == ASIC_INVALID) {
+		dev_err(&pdev->dev, "Unsupported ASIC\n");
+		rc = -ENODEV;
+		goto free_hdev;
+	}
 
-	hdev->pldm = 0;
+	copy_kernel_module_params_to_device(hdev);
 
 	set_driver_behavior_per_device(hdev);
 
-	hdev->fw_poll_interval_usec = hdev->pldm ? HL_FW_STATUS_PLDM_POLL_INTERVAL_USEC :
-							HL_FW_STATUS_POLL_INTERVAL_USEC;
-
-	hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
-	hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
-
-	if (timeout_locked)
-		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
-	else
-		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
-
-	hdev->disabled = true;
-	hdev->pdev = pdev; /* can be NULL in case of simulator device */
-
-	/* Set default DMA mask to 32 bits */
-	hdev->dma_mask = 32;
+	fixup_device_params(hdev);
 
 	mutex_lock(&hl_devs_idr_lock);
 
 	/* Always save 2 numbers, 1 for main device and 1 for control.
 	 * They must be consecutive
 	 */
-	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
-				GFP_KERNEL);
+	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
 
 	if (main_id >= 0)
 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
@@ -408,7 +406,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
  * @dev: pointer to the habanalabs device structure
  *
  */
-void destroy_hdev(struct hl_device *hdev)
+static void destroy_hdev(struct hl_device *hdev)
 {
 	/* Remove device from the device list */
 	mutex_lock(&hl_devs_idr_lock);
@@ -447,7 +445,7 @@ static int hl_pmops_resume(struct device *dev)
 	return hl_device_resume(hdev);
 }
 
-/*
+/**
  * hl_pci_probe - probe PCI habanalabs devices
  *
  * @pdev: pointer to pci device
@@ -457,8 +455,7 @@ static int hl_pmops_resume(struct device *dev)
  * Create a new habanalabs device and initialize it according to the
  * device's type
  */
-static int hl_pci_probe(struct pci_dev *pdev,
-				const struct pci_device_id *id)
+static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct hl_device *hdev;
 	int rc;
@@ -467,7 +464,7 @@ static int hl_pci_probe(struct pci_dev *pdev,
 		 " device found [%04x:%04x] (rev %x)\n",
 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
 
-	rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
+	rc = create_hdev(&hdev, pdev);
 	if (rc)
 		return rc;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-11-07 20:52 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-07 20:51 [PATCH 01/10] habanalabs: don't clear previous f/w indications Oded Gabbay
2021-11-07 20:51 ` [PATCH 02/10] habanalabs: handle abort scenario for user interrupt Oded Gabbay
2021-11-07 20:51 ` [PATCH 03/10] habanalabs: add dedicated message towards f/w to set power Oded Gabbay
2021-11-07 20:51 ` [PATCH 04/10] habanalabs: rename reset flags Oded Gabbay
2021-11-07 20:51 ` [PATCH 05/10] habanalabs: change wait for interrupt timeout to 64 bit Oded Gabbay
2021-11-07 20:51 ` [PATCH 06/10] habanalabs: expand clock throttling information uAPI Oded Gabbay
2021-11-07 20:51 ` [PATCH 07/10] habanalabs/gaudi: Fix collective wait bug Oded Gabbay
2021-11-07 20:51 ` [PATCH 08/10] habanalabs: refactor wait-for-user-interrupt function Oded Gabbay
2021-11-07 20:51 ` [PATCH 09/10] habanalabs: add new opcodes for INFO IOCTL Oded Gabbay
2021-11-07 20:51 ` [PATCH 10/10] habanalabs: make hdev creation code more readable Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).