linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/12] habanalabs: fix soft reset accounting
@ 2021-11-28 19:34 Oded Gabbay
  2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
                   ` (10 more replies)
  0 siblings, 11 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel

Reset upon device release is not a soft-reset from user/system point
of view. As such, we shouldn't count that reset in the statistics we
gather and expose to the monitoring applications.

We also shouldn't print soft-reset when doing the reset upon device
release.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 50 ++++++++++++-------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 484e0446381e..2b208007c26f 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -962,13 +962,13 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
  */
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
-	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
+	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
+								reset_upon_device_release = false;
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	int i, rc;
 
 	if (!hdev->init_done) {
-		dev_err(hdev->dev,
-			"Can't reset before initialization is done\n");
+		dev_err(hdev->dev, "Can't reset before initialization is done\n");
 		return 0;
 	}
 
@@ -988,6 +988,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 			return -EINVAL;
 		}
 
+		reset_upon_device_release = true;
+
 		goto do_reset;
 	}
 
@@ -1024,12 +1026,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 		if (hard_reset)
 			dev_info(hdev->dev, "Going to reset device\n");
-		else if (flags & HL_DRV_RESET_DEV_RELEASE)
-			dev_info(hdev->dev,
-				"Going to reset device after it was released by user\n");
+		else if (reset_upon_device_release)
+			dev_info(hdev->dev, "Going to reset device after release by user\n");
 		else
-			dev_info(hdev->dev,
-				"Going to reset compute engines of inference device\n");
+			dev_info(hdev->dev, "Going to reset engines of inference device\n");
 	}
 
 again:
@@ -1174,16 +1174,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 	rc = hdev->asic_funcs->hw_init(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"failed to initialize the H/W after reset\n");
+		dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
 		goto out_err;
 	}
 
 	/* If device is not idle fail the reset process */
 	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
 			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
-		dev_err(hdev->dev,
-			"device is not idle (mask 0x%llx_%llx) after reset\n",
+		dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
 			idle_mask[1], idle_mask[0]);
 		rc = -EIO;
 		goto out_err;
@@ -1192,23 +1190,20 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	/* Check that the communication with the device is working */
 	rc = hdev->asic_funcs->test_queues(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"Failed to detect if device is alive after reset\n");
+		dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
 		goto out_err;
 	}
 
 	if (hard_reset) {
 		rc = device_late_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed late init after hard reset\n");
+			dev_err(hdev->dev, "Failed late init after hard reset\n");
 			goto out_err;
 		}
 
 		rc = hl_vm_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed to init memory module after hard reset\n");
+			dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
 			goto out_err;
 		}
 
@@ -1216,8 +1211,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	} else {
 		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed late init after soft reset\n");
+			if (reset_upon_device_release)
+				dev_err(hdev->dev,
+					"Failed late init in reset after device release\n");
+			else
+				dev_err(hdev->dev, "Failed late init after soft reset\n");
 			goto out_err;
 		}
 	}
@@ -1236,7 +1234,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		 * the device will be operational although it shouldn't be
 		 */
 		hdev->asic_funcs->enable_events_from_fw(hdev);
-	} else {
+	} else if (!reset_upon_device_release) {
 		hdev->soft_reset_cnt++;
 	}
 
@@ -1246,12 +1244,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	hdev->disabled = true;
 
 	if (hard_reset) {
-		dev_err(hdev->dev,
-			"Failed to reset! Device is NOT usable\n");
+		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
 		hdev->hard_reset_cnt++;
+	} else if (reset_upon_device_release) {
+		dev_err(hdev->dev, "Failed to reset device after user release\n");
+		hard_reset = true;
+		goto again;
 	} else {
-		dev_err(hdev->dev,
-			"Failed to do soft-reset, trying hard reset\n");
+		dev_err(hdev->dev, "Failed to do soft-reset\n");
 		hdev->soft_reset_cnt++;
 		hard_reset = true;
 		goto again;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 02/12] habanalabs: rename late init after reset function
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 03/12] habanalabs/gaudi: return EPERM on non hard-reset Oded Gabbay
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel

The ASIC-specific soft_reset_late_init() is now called after either
soft-reset or reset-upon-device-release. Therefore, it needs a more
appropriate name.

No need to split it to two functions, as an ASIC either supports
soft-reset or reset-upon-device-release.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 2 +-
 drivers/misc/habanalabs/common/habanalabs.h | 4 ++--
 drivers/misc/habanalabs/gaudi/gaudi.c       | 4 ++--
 drivers/misc/habanalabs/goya/goya.c         | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 2b208007c26f..822d9cec5aaf 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1209,7 +1209,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 		hl_set_max_power(hdev);
 	} else {
-		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
+		rc = hdev->asic_funcs->non_hard_reset_late_init(hdev);
 		if (rc) {
 			if (reset_upon_device_release)
 				dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 1a7f8d37f684..a465b4a5f31d 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1153,7 +1153,7 @@ struct fw_load_mgr {
  * @disable_clock_gating: disable clock gating completely
  * @debug_coresight: perform certain actions on Coresight for debugging.
  * @is_device_idle: return true if device is idle, false otherwise.
- * @soft_reset_late_init: perform certain actions needed after soft reset.
+ * @non_hard_reset_late_init: perform certain actions needed after a reset which is not hard-reset
  * @hw_queues_lock: acquire H/W queues lock.
  * @hw_queues_unlock: release H/W queues lock.
  * @get_pci_id: retrieve PCI ID.
@@ -1289,7 +1289,7 @@ struct hl_asic_funcs {
 	int (*debug_coresight)(struct hl_device *hdev, void *data);
 	bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr,
 					u8 mask_len, struct seq_file *s);
-	int (*soft_reset_late_init)(struct hl_device *hdev);
+	int (*non_hard_reset_late_init)(struct hl_device *hdev);
 	void (*hw_queues_lock)(struct hl_device *hdev);
 	void (*hw_queues_unlock)(struct hl_device *hdev);
 	u32 (*get_pci_id)(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index f29afcca74fc..464d205a26ed 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7819,7 +7819,7 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev,
 		fw_alive->thread_id, fw_alive->uptime_seconds);
 }
 
-static int gaudi_soft_reset_late_init(struct hl_device *hdev)
+static int gaudi_non_hard_reset_late_init(struct hl_device *hdev)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
 
@@ -9591,7 +9591,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.disable_clock_gating = gaudi_disable_clock_gating,
 	.debug_coresight = gaudi_debug_coresight,
 	.is_device_idle = gaudi_is_device_idle,
-	.soft_reset_late_init = gaudi_soft_reset_late_init,
+	.non_hard_reset_late_init = gaudi_non_hard_reset_late_init,
 	.hw_queues_lock = gaudi_hw_queues_lock,
 	.hw_queues_unlock = gaudi_hw_queues_unlock,
 	.get_pci_id = gaudi_get_pci_id,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index bbee6739ce87..e54d60e75854 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4813,7 +4813,7 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
 	return rc;
 }
 
-static int goya_soft_reset_late_init(struct hl_device *hdev)
+static int goya_non_hard_reset_late_init(struct hl_device *hdev)
 {
 	/*
 	 * Unmask all IRQs since some could have been received
@@ -5738,7 +5738,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.disable_clock_gating = goya_disable_clock_gating,
 	.debug_coresight = goya_debug_coresight,
 	.is_device_idle = goya_is_device_idle,
-	.soft_reset_late_init = goya_soft_reset_late_init,
+	.non_hard_reset_late_init = goya_non_hard_reset_late_init,
 	.hw_queues_lock = goya_hw_queues_lock,
 	.hw_queues_unlock = goya_hw_queues_unlock,
 	.get_pci_id = goya_get_pci_id,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 03/12] habanalabs/gaudi: return EPERM on non hard-reset
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
  2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 04/12] habanalabs: move device boot warnings to the correct location Oded Gabbay
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel

GAUDI supports only hard-reset. Therefore, this function should
return an error of operation not permitted.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 464d205a26ed..07e03d44930e 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7821,12 +7821,8 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev,
 
 static int gaudi_non_hard_reset_late_init(struct hl_device *hdev)
 {
-	struct gaudi_device *gaudi = hdev->asic_specific;
-
-	/* Unmask all IRQs since some could have been received
-	 * during the soft reset
-	 */
-	return hl_fw_unmask_irq_arr(hdev, gaudi->events, sizeof(gaudi->events));
+	/* GAUDI doesn't support any reset except hard-reset */
+	return -EPERM;
 }
 
 static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 04/12] habanalabs: move device boot warnings to the correct location
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
  2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
  2021-11-28 19:34 ` [PATCH 03/12] habanalabs/gaudi: return EPERM on non hard-reset Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 05/12] habanalabs: fix race condition in multi CS completion Oded Gabbay
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

As device boot warnings clears the indication from the error mask,
they must be located together before the unknown error validation.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 45 ++++++++++----------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index aea5904332fd..cf67800f2b47 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -443,15 +443,6 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
 		err_exists = true;
 	}
 
-	if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
-		dev_warn(hdev->dev,
-			"Device boot warning - Skipped DRAM initialization\n");
-		/* This is a warning so we don't want it to disable the
-		 * device
-		 */
-		err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
-	}
-
 	if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
 		if (hdev->bmc_enable) {
 			dev_err(hdev->dev,
@@ -495,15 +486,6 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
 		err_exists = true;
 	}
 
-	if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
-		dev_warn(hdev->dev,
-			"Device boot warning - Failed to load preboot primary image\n");
-		/* This is a warning so we don't want it to disable the
-		 * device as we have a secondary preboot image
-		 */
-		err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
-	}
-
 	if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) {
 		dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n");
 		err_exists = true;
@@ -523,10 +505,23 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
 	if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
 		dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);
 
-	if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
-		dev_err(hdev->dev,
-			"Device boot error - unknown ERR0 error 0x%08x\n", err_val);
-		err_exists = true;
+	/* All warnings should go here in order not to reach the unknown error validation */
+	if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
+		dev_warn(hdev->dev,
+			"Device boot warning - Skipped DRAM initialization\n");
+		/* This is a warning so we don't want it to disable the
+		 * device
+		 */
+		err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
+	}
+
+	if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
+		dev_warn(hdev->dev,
+			"Device boot warning - Failed to load preboot primary image\n");
+		/* This is a warning so we don't want it to disable the
+		 * device as we have a secondary preboot image
+		 */
+		err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
 	}
 
 	if (err_val & CPU_BOOT_ERR0_TPM_FAIL) {
@@ -538,6 +533,12 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
 		err_val &= ~CPU_BOOT_ERR0_TPM_FAIL;
 	}
 
+	if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
+		dev_err(hdev->dev,
+			"Device boot error - unknown ERR0 error 0x%08x\n", err_val);
+		err_exists = true;
+	}
+
 	/* return error only if it's in the predefined mask */
 	if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &
 				lower_32_bits(hdev->boot_error_status_mask)))
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 05/12] habanalabs: fix race condition in multi CS completion
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
                   ` (2 preceding siblings ...)
  2021-11-28 19:34 ` [PATCH 04/12] habanalabs: move device boot warnings to the correct location Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 06/12] habanalabs: add more info ioctls support during reset Oded Gabbay
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Race example scenario:
1. User have 2 threads that waits on multi CS:
   - thread_0 waits on QID 0 and uses multi CS context 0.
   - thread_1 waits on QID 1 and uses multi CS context 1.
2. thread_1 got completion and release multi CS context 1.
3. CS related to multi CS of thread_0 starts executing
   complete_multi_cs function, the first iteration of the loop
   completes the multi CS of thread_0, hence multi CS context 0
   is released.
4. thread_1 waits on QID 1 and uses multi CS context 0.
5. thread_0 waits on QID 0 and uses multi CS context 1.
6. The second iterattion of the loop (from step 3) starts, which
   means, start checking multi CS context 1:
   - multi CS contetxt is being used by thread_0 waiting on QID 0.
   - The fence of the CS (still CS from step 3) has QID map the same
     as the multi CS context 1.
   - multi CS context 1 (thread_0) gets completion on CS that triggered
     already thread_0 (with multi CS context 0) and is no longer
     being waited on.

Fixed by exiting the loop in complete_multi_cs after getting completion

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index c1fd4ba14c60..4e893364a3cc 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -545,6 +545,13 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
 			 * mcs fences.
 			 */
 			fence->mcs_handling_done = true;
+			/*
+			 * Since CS (and its related fence) can be associated with only one
+			 * multi CS context, once it triggered multi CS completion no need to
+			 * continue checking other multi CS contexts.
+			 */
+			spin_unlock(&mcs_compl->lock);
+			break;
 		}
 
 		spin_unlock(&mcs_compl->lock);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 06/12] habanalabs: add more info ioctls support during reset
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
                   ` (3 preceding siblings ...)
  2021-11-28 19:34 ` [PATCH 05/12] habanalabs: fix race condition in multi CS completion Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 07/12] habanalabs: add power information type to POWER_GET packet Oded Gabbay
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Some info ioctls can be served even if the device is disabled or
in reset. Hence, we enable more info ioctls during reset, as these
ioctls do not require any H/W nor F/W communication.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../misc/habanalabs/common/habanalabs_ioctl.c | 55 +++++++++----------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 360a1e9bbd5d..15797d55b4e8 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -614,6 +614,33 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_RESET_COUNT:
 		return get_reset_count(hdev, args);
 
+	case HL_INFO_HW_EVENTS:
+		return hw_events_info(hdev, false, args);
+
+	case HL_INFO_HW_EVENTS_AGGREGATE:
+		return hw_events_info(hdev, true, args);
+
+	case HL_INFO_CS_COUNTERS:
+		return cs_counters_info(hpriv, args);
+
+	case HL_INFO_CLK_THROTTLE_REASON:
+		return clk_throttle_info(hpriv, args);
+
+	case HL_INFO_SYNC_MANAGER:
+		return sync_manager_info(hpriv, args);
+
+	case HL_INFO_OPEN_STATS:
+		return open_stats_info(hpriv, args);
+
+	case HL_INFO_LAST_ERR_OPEN_DEV_TIME:
+		return last_err_open_dev_info(hpriv, args);
+
+	case HL_INFO_CS_TIMEOUT_EVENT:
+		return cs_timeout_info(hpriv, args);
+
+	case HL_INFO_RAZWI_EVENT:
+		return razwi_info(hpriv, args);
+
 	default:
 		break;
 	}
@@ -626,10 +653,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	}
 
 	switch (args->op) {
-	case HL_INFO_HW_EVENTS:
-		rc = hw_events_info(hdev, false, args);
-		break;
-
 	case HL_INFO_DRAM_USAGE:
 		rc = dram_usage_info(hpriv, args);
 		break;
@@ -642,10 +665,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 		rc = device_utilization(hdev, args);
 		break;
 
-	case HL_INFO_HW_EVENTS_AGGREGATE:
-		rc = hw_events_info(hdev, true, args);
-		break;
-
 	case HL_INFO_CLK_RATE:
 		rc = get_clk_rate(hdev, args);
 		break;
@@ -653,18 +672,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_TIME_SYNC:
 		return time_sync_info(hdev, args);
 
-	case HL_INFO_CS_COUNTERS:
-		return cs_counters_info(hpriv, args);
-
 	case HL_INFO_PCI_COUNTERS:
 		return pci_counters_info(hpriv, args);
 
-	case HL_INFO_CLK_THROTTLE_REASON:
-		return clk_throttle_info(hpriv, args);
-
-	case HL_INFO_SYNC_MANAGER:
-		return sync_manager_info(hpriv, args);
-
 	case HL_INFO_TOTAL_ENERGY:
 		return total_energy_consumption_info(hpriv, args);
 
@@ -674,8 +684,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_POWER:
 		return power_info(hpriv, args);
 
-	case HL_INFO_OPEN_STATS:
-		return open_stats_info(hpriv, args);
 
 	case HL_INFO_DRAM_REPLACED_ROWS:
 		return dram_replaced_rows_info(hpriv, args);
@@ -683,15 +691,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_DRAM_PENDING_ROWS:
 		return dram_pending_rows_info(hpriv, args);
 
-	case HL_INFO_LAST_ERR_OPEN_DEV_TIME:
-		return last_err_open_dev_info(hpriv, args);
-
-	case HL_INFO_CS_TIMEOUT_EVENT:
-		return cs_timeout_info(hpriv, args);
-
-	case HL_INFO_RAZWI_EVENT:
-		return razwi_info(hpriv, args);
-
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -ENOTTY;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 07/12] habanalabs: add power information type to POWER_GET packet
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
                   ` (4 preceding siblings ...)
  2021-11-28 19:34 ` [PATCH 06/12] habanalabs: add more info ioctls support during reset Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 08/12] habanalabs: change misleading IRQ warning during reset Oded Gabbay
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

In new f/w versions, it is required to explicitly indicate the power
information type when querying the F/W for power info.
When getting the current power level it should be set to power_input.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 1 +
 include/uapi/misc/habanalabs.h               | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index cf67800f2b47..bafadcc65497 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -969,6 +969,7 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
 
 	pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET <<
 				CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.type = CPUCP_POWER_INPUT;
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 			HL_CPUCP_INFO_TIMEOUT_USEC, &result);
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index eb8565fdae70..cd86937c572d 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -333,6 +333,7 @@ enum hl_server_type {
  * HL_INFO_SYNC_MANAGER  - Retrieve sync manager info per dcore
  * HL_INFO_TOTAL_ENERGY  - Retrieve total energy consumption
  * HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency
+ * HL_INFO_POWER         - Retrieve power information
  * HL_INFO_OPEN_STATS    - Retrieve info regarding recent device open calls
  * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info
  * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 08/12] habanalabs: change misleading IRQ warning during reset
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
                   ` (5 preceding siblings ...)
  2021-11-28 19:34 ` [PATCH 07/12] habanalabs: add power information type to POWER_GET packet Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 09/12] habanalabs: handle events during soft-reset Oded Gabbay
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Currently we dump the physical IRQ line index in host if an event
is received during reset. This ID is confusing as it means nothing
to the user.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/irq.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index 96d82b682674..9fd4c18e274e 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -246,9 +246,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
 		dma_rmb();
 
 		if (hdev->disabled) {
-			dev_warn(hdev->dev,
-				"Device disabled but received IRQ %d for EQ\n",
-					irq);
+			dev_warn(hdev->dev, "Device disabled but received an EQ event\n");
 			goto skip_irq;
 		}
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 09/12] habanalabs: handle events during soft-reset
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
                   ` (6 preceding siblings ...)
  2021-11-28 19:34 ` [PATCH 08/12] habanalabs: change misleading IRQ warning during reset Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 10/12] habanalabs: skip read fw errors if dynamic descriptor invalid Oded Gabbay
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Driver should handle events during soft-reset as F/W is not
going through reset and it keeps sending events towards host.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 4 ++++
 drivers/misc/habanalabs/common/habanalabs.h | 2 ++
 drivers/misc/habanalabs/common/irq.c        | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 822d9cec5aaf..720eea0b7e9c 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1019,6 +1019,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 		handle_reset_trigger(hdev, flags);
 
+		hdev->is_in_soft_reset = !hard_reset;
+
 		/* This also blocks future CS/VM/JOB completion operations */
 		hdev->disabled = true;
 
@@ -1171,6 +1173,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	 * is required for the initialization itself
 	 */
 	hdev->disabled = false;
+	hdev->is_in_soft_reset = false;
 
 	rc = hdev->asic_funcs->hw_init(hdev);
 	if (rc) {
@@ -1242,6 +1245,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 out_err:
 	hdev->disabled = true;
+	hdev->is_in_soft_reset = false;
 
 	if (hard_reset) {
 		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index a465b4a5f31d..c2129c9fe9e4 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2591,6 +2591,7 @@ struct last_error_session_info {
  *                        protocol will throw an error. Relevant only for
  *                        cases where Linux was not loaded to device CPU
  * @supports_wait_for_multi_cs: true if wait for multi CS is supported
+ * @is_in_soft_reset: Device is currently in soft reset process.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2719,6 +2720,7 @@ struct hl_device {
 	u8				device_cpu_is_halted;
 	u8				supports_wait_for_multi_cs;
 	u8				stream_master_qid_arr_size;
+	u8				is_in_soft_reset;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index 9fd4c18e274e..64e0d9de21bd 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -245,7 +245,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
 		 */
 		dma_rmb();
 
-		if (hdev->disabled) {
+		if (hdev->disabled && !hdev->is_in_soft_reset) {
 			dev_warn(hdev->dev, "Device disabled but received an EQ event\n");
 			goto skip_irq;
 		}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 10/12] habanalabs: skip read fw errors if dynamic descriptor invalid
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
                   ` (7 preceding siblings ...)
  2021-11-28 19:34 ` [PATCH 09/12] habanalabs: handle events during soft-reset Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 11/12] habanalabs: add SOB information to signal submission uAPI Oded Gabbay
  2021-11-28 19:34 ` [PATCH 12/12] habanalabs: enable access to info ioctl during hard reset Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

Reporting FW errors involves reading of the error registers.

In case we have a corrupted FW descriptor we cannot do that since the
dynamic scratchpad is potentially corrupted as well and may cause kernel
crush when attempting access to a corrupted register offset.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 17 +++++++++++++++--
 drivers/misc/habanalabs/common/habanalabs.h  |  2 ++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index bafadcc65497..760ca139d5cf 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -1772,6 +1772,9 @@ static int hl_fw_dynamic_validate_descriptor(struct hl_device *hdev,
 		return rc;
 	}
 
+	/* here we can mark the descriptor as valid as the content has been validated */
+	fw_loader->dynamic_loader.fw_desc_valid = true;
+
 	return 0;
 }
 
@@ -1828,7 +1831,13 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
 		return rc;
 	}
 
-	/* extract address copy the descriptor from */
+	/*
+	 * extract address to copy the descriptor from
+	 * in addition, as the descriptor value is going to be over-ridden by new data- we mark it
+	 * as invalid.
+	 * it will be marked again as valid once validated
+	 */
+	fw_loader->dynamic_loader.fw_desc_valid = false;
 	src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
 							response->ram_offset;
 	memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc));
@@ -2317,6 +2326,9 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	dev_info(hdev->dev,
 		"Loading firmware to device, may take some time...\n");
 
+	/* initialize FW descriptor as invalid */
+	fw_loader->dynamic_loader.fw_desc_valid = false;
+
 	/*
 	 * In this stage, "cpu_dyn_regs" contains only LKD's hard coded values!
 	 * It will be updated from FW after hl_fw_dynamic_request_descriptor().
@@ -2412,7 +2424,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	return 0;
 
 protocol_err:
-	fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0),
+	if (fw_loader->dynamic_loader.fw_desc_valid)
+		fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0),
 				le32_to_cpu(dyn_regs->cpu_boot_err1),
 				le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
 				le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index c2129c9fe9e4..77ac4bb98137 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1034,6 +1034,7 @@ struct fw_response {
  * @image_region: region to copy the FW image to
  * @fw_image_size: size of FW image to load
  * @wait_for_bl_timeout: timeout for waiting for boot loader to respond
+ * @fw_desc_valid: true if FW descriptor has been validated and hence the data can be used
  */
 struct dynamic_fw_load_mgr {
 	struct fw_response response;
@@ -1041,6 +1042,7 @@ struct dynamic_fw_load_mgr {
 	struct pci_mem_region *image_region;
 	size_t fw_image_size;
 	u32 wait_for_bl_timeout;
+	bool fw_desc_valid;
 };
 
 /**
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 11/12] habanalabs: add SOB information to signal submission uAPI
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
                   ` (8 preceding siblings ...)
  2021-11-28 19:34 ` [PATCH 10/12] habanalabs: skip read fw errors if dynamic descriptor invalid Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  2021-11-28 19:34 ` [PATCH 12/12] habanalabs: enable access to info ioctl during hard reset Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

For debug purpose, add SOB address and SOB initial counter value
before current submission to uAPI output.

Using SOB address and initial counter, user can calculate how much of
the submmision has been completed.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 37 +++++++++++++++----
 drivers/misc/habanalabs/common/habanalabs.h   |  5 +++
 drivers/misc/habanalabs/common/hw_queue.c     |  3 ++
 include/uapi/misc/habanalabs.h                | 10 ++++-
 4 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 4e893364a3cc..7a277f442207 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -1277,7 +1277,8 @@ static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
 
 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 				u32 num_chunks, u64 *cs_seq, u32 flags,
-				u32 encaps_signals_handle, u32 timeout)
+				u32 encaps_signals_handle, u32 timeout,
+				u16 *signal_initial_sob_count)
 {
 	bool staged_mid, int_queues_only = true;
 	struct hl_device *hdev = hpriv->hdev;
@@ -1444,6 +1445,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 		goto free_cs_object;
 	}
 
+	*signal_initial_sob_count = cs->initial_sob_count;
+
 	rc = HL_CS_STATUS_SUCCESS;
 	goto put_cs;
 
@@ -1472,6 +1475,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 	int rc = 0, do_ctx_switch;
 	void __user *chunks;
 	u32 num_chunks, tmp;
+	u16 sob_count;
 	int ret;
 
 	do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
@@ -1512,7 +1516,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 			rc = 0;
 		} else {
 			rc = cs_ioctl_default(hpriv, chunks, num_chunks,
-					cs_seq, 0, 0, hdev->timeout_jiffies);
+					cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count);
 		}
 
 		mutex_unlock(&hpriv->restore_phase_mutex);
@@ -1963,7 +1967,8 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
 
 static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 				void __user *chunks, u32 num_chunks,
-				u64 *cs_seq, u32 flags, u32 timeout)
+				u64 *cs_seq, u32 flags, u32 timeout,
+				u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count)
 {
 	struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
 	bool handle_found = false, is_wait_cs = false,
@@ -2195,6 +2200,9 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 		goto free_cs_object;
 	}
 
+	*signal_sob_addr_offset = cs->sob_addr_offset;
+	*signal_initial_sob_count = cs->initial_sob_count;
+
 	rc = HL_CS_STATUS_SUCCESS;
 	if (is_wait_cs)
 		wait_cs_submitted = true;
@@ -2225,6 +2233,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	void __user *chunks;
 	u32 num_chunks, flags, timeout,
 		signals_count = 0, sob_addr = 0, handle_id = 0;
+	u16 sob_initial_count = 0;
 	int rc;
 
 	rc = hl_cs_sanity_checks(hpriv, args);
@@ -2255,7 +2264,8 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	case CS_TYPE_WAIT:
 	case CS_TYPE_COLLECTIVE_WAIT:
 		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
-					&cs_seq, args->in.cs_flags, timeout);
+					&cs_seq, args->in.cs_flags, timeout,
+					&sob_addr, &sob_initial_count);
 		break;
 	case CS_RESERVE_SIGNALS:
 		rc = cs_ioctl_reserve_signals(hpriv,
@@ -2271,20 +2281,33 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
 						args->in.cs_flags,
 						args->in.encaps_sig_handle_id,
-						timeout);
+						timeout, &sob_initial_count);
 		break;
 	}
 out:
 	if (rc != -EAGAIN) {
 		memset(args, 0, sizeof(*args));
 
-		if (cs_type == CS_RESERVE_SIGNALS) {
+		switch (cs_type) {
+		case CS_RESERVE_SIGNALS:
 			args->out.handle_id = handle_id;
 			args->out.sob_base_addr_offset = sob_addr;
 			args->out.count = signals_count;
-		} else {
+			break;
+		case CS_TYPE_SIGNAL:
+			args->out.sob_base_addr_offset = sob_addr;
+			args->out.sob_count_before_submission = sob_initial_count;
+			args->out.seq = cs_seq;
+			break;
+		case CS_TYPE_DEFAULT:
+			args->out.sob_count_before_submission = sob_initial_count;
+			args->out.seq = cs_seq;
+			break;
+		default:
 			args->out.seq = cs_seq;
+			break;
 		}
+
 		args->out.status = rc;
 	}
 
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 77ac4bb98137..93d0a85265be 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1545,6 +1545,9 @@ struct hl_userptr {
  * @submission_time_jiffies: submission time of the cs
  * @type: CS_TYPE_*.
  * @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs.
+ * @sob_addr_offset: sob offset from the configuration base address.
+ * @initial_sob_count: count of completed signals in SOB before current submission of signal or
+ *                     cs with encaps signals.
  * @submitted: true if CS was submitted to H/W.
  * @completed: true if CS was completed by device.
  * @timedout : true if CS was timedout.
@@ -1580,6 +1583,8 @@ struct hl_cs {
 	u64			submission_time_jiffies;
 	enum hl_cs_type		type;
 	u32			encaps_sig_hdl_id;
+	u32			sob_addr_offset;
+	u16			initial_sob_count;
 	u8			submitted;
 	u8			completed;
 	u8			timedout;
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 0743319b10c7..fc841d651210 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -429,6 +429,9 @@ static int init_signal_cs(struct hl_device *hdev,
 	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1,
 								false);
 
+	job->cs->sob_addr_offset = hw_sob->sob_addr;
+	job->cs->initial_sob_count = prop->next_sob_val - 1;
+
 	return rc;
 }
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index cd86937c572d..648850b954a3 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -929,9 +929,17 @@ struct hl_cs_out {
 
 	/*
 	 * SOB base address offset
-	 * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set
+	 * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY or HL_CS_FLAGS_SIGNAL is set
 	 */
 	__u32 sob_base_addr_offset;
+
+	/*
+	 * Count of completed signals in SOB before current signal submission.
+	 * Valid only when (HL_CS_FLAGS_ENCAP_SIGNALS & HL_CS_FLAGS_STAGED_SUBMISSION)
+	 * or HL_CS_FLAGS_SIGNAL is set
+	 */
+	__u16 sob_count_before_submission;
+	__u16 pad[3];
 };
 
 union hl_cs_args {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 12/12] habanalabs: enable access to info ioctl during hard reset
  2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
                   ` (9 preceding siblings ...)
  2021-11-28 19:34 ` [PATCH 11/12] habanalabs: add SOB information to signal submission uAPI Oded Gabbay
@ 2021-11-28 19:34 ` Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Because info ioctl is used to retrieve data, some of its opcodes may be
used during hard reset.
Other ioctls should be blocked while device is not operational.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 5 +----
 drivers/misc/habanalabs/common/habanalabs_ioctl.c   | 7 -------
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 7a277f442207..8be547b0926c 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -1146,9 +1146,6 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
 	enum hl_cs_type cs_type;
 
 	if (!hl_device_operational(hdev, &status)) {
-		dev_warn_ratelimited(hdev->dev,
-			"Device is %s. Can't submit new CS\n",
-			hdev->status[status]);
 		return -EBUSY;
 	}
 
@@ -2997,7 +2994,7 @@ int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	 * user interrupt
 	 */
 	if (!hl_device_operational(hpriv->hdev, NULL))
-		return -EPERM;
+		return -EBUSY;
 
 	if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
 		rc = hl_interrupt_wait_ioctl(hpriv, data);
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 15797d55b4e8..6c7339978bae 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -774,7 +774,6 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
 		const struct hl_ioctl_desc *ioctl, struct device *dev)
 {
 	struct hl_fpriv *hpriv = filep->private_data;
-	struct hl_device *hdev = hpriv->hdev;
 	unsigned int nr = _IOC_NR(cmd);
 	char stack_kdata[128] = {0};
 	char *kdata = NULL;
@@ -783,12 +782,6 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
 	u32 hl_size;
 	int retcode;
 
-	if (hdev->hard_reset_pending) {
-		dev_crit_ratelimited(dev,
-			"Device HARD reset pending! Please close FD\n");
-		return -ENODEV;
-	}
-
 	/* Do not trust userspace, use our own definition */
 	func = ioctl->func;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-11-28 19:37 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
2021-11-28 19:34 ` [PATCH 03/12] habanalabs/gaudi: return EPERM on non hard-reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 04/12] habanalabs: move device boot warnings to the correct location Oded Gabbay
2021-11-28 19:34 ` [PATCH 05/12] habanalabs: fix race condition in multi CS completion Oded Gabbay
2021-11-28 19:34 ` [PATCH 06/12] habanalabs: add more info ioctls support during reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 07/12] habanalabs: add power information type to POWER_GET packet Oded Gabbay
2021-11-28 19:34 ` [PATCH 08/12] habanalabs: change misleading IRQ warning during reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 09/12] habanalabs: handle events during soft-reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 10/12] habanalabs: skip read fw errors if dynamic descriptor invalid Oded Gabbay
2021-11-28 19:34 ` [PATCH 11/12] habanalabs: add SOB information to signal submission uAPI Oded Gabbay
2021-11-28 19:34 ` [PATCH 12/12] habanalabs: enable access to info ioctl during hard reset Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).