linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/12] habanalabs: fix soft reset accounting
@ 2021-11-28 19:34 Oded Gabbay
  2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
                   ` (10 more replies)
  0 siblings, 11 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel

Reset upon device release is not a soft-reset from the user/system point
of view. As such, we shouldn't count that reset in the statistics we
gather and expose to the monitoring applications.

We also shouldn't mention soft-reset in the log messages we print when
doing the reset upon device release.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 50 ++++++++++++-------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 484e0446381e..2b208007c26f 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -962,13 +962,13 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
  */
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
-	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
+	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
+								reset_upon_device_release = false;
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	int i, rc;
 
 	if (!hdev->init_done) {
-		dev_err(hdev->dev,
-			"Can't reset before initialization is done\n");
+		dev_err(hdev->dev, "Can't reset before initialization is done\n");
 		return 0;
 	}
 
@@ -988,6 +988,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 			return -EINVAL;
 		}
 
+		reset_upon_device_release = true;
+
 		goto do_reset;
 	}
 
@@ -1024,12 +1026,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 		if (hard_reset)
 			dev_info(hdev->dev, "Going to reset device\n");
-		else if (flags & HL_DRV_RESET_DEV_RELEASE)
-			dev_info(hdev->dev,
-				"Going to reset device after it was released by user\n");
+		else if (reset_upon_device_release)
+			dev_info(hdev->dev, "Going to reset device after release by user\n");
 		else
-			dev_info(hdev->dev,
-				"Going to reset compute engines of inference device\n");
+			dev_info(hdev->dev, "Going to reset engines of inference device\n");
 	}
 
 again:
@@ -1174,16 +1174,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 	rc = hdev->asic_funcs->hw_init(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"failed to initialize the H/W after reset\n");
+		dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
 		goto out_err;
 	}
 
 	/* If device is not idle fail the reset process */
 	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
 			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
-		dev_err(hdev->dev,
-			"device is not idle (mask 0x%llx_%llx) after reset\n",
+		dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
 			idle_mask[1], idle_mask[0]);
 		rc = -EIO;
 		goto out_err;
@@ -1192,23 +1190,20 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	/* Check that the communication with the device is working */
 	rc = hdev->asic_funcs->test_queues(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"Failed to detect if device is alive after reset\n");
+		dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
 		goto out_err;
 	}
 
 	if (hard_reset) {
 		rc = device_late_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed late init after hard reset\n");
+			dev_err(hdev->dev, "Failed late init after hard reset\n");
 			goto out_err;
 		}
 
 		rc = hl_vm_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed to init memory module after hard reset\n");
+			dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
 			goto out_err;
 		}
 
@@ -1216,8 +1211,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	} else {
 		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed late init after soft reset\n");
+			if (reset_upon_device_release)
+				dev_err(hdev->dev,
+					"Failed late init in reset after device release\n");
+			else
+				dev_err(hdev->dev, "Failed late init after soft reset\n");
 			goto out_err;
 		}
 	}
@@ -1236,7 +1234,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		 * the device will be operational although it shouldn't be
 		 */
 		hdev->asic_funcs->enable_events_from_fw(hdev);
-	} else {
+	} else if (!reset_upon_device_release) {
 		hdev->soft_reset_cnt++;
 	}
 
@@ -1246,12 +1244,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	hdev->disabled = true;
 
 	if (hard_reset) {
-		dev_err(hdev->dev,
-			"Failed to reset! Device is NOT usable\n");
+		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
 		hdev->hard_reset_cnt++;
+	} else if (reset_upon_device_release) {
+		dev_err(hdev->dev, "Failed to reset device after user release\n");
+		hard_reset = true;
+		goto again;
 	} else {
-		dev_err(hdev->dev,
-			"Failed to do soft-reset, trying hard reset\n");
+		dev_err(hdev->dev, "Failed to do soft-reset\n");
 		hdev->soft_reset_cnt++;
 		hard_reset = true;
 		goto again;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-11-28 19:37 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
2021-11-28 19:34 ` [PATCH 03/12] habanalabs/gaudi: return EPERM on non hard-reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 04/12] habanalabs: move device boot warnings to the correct location Oded Gabbay
2021-11-28 19:34 ` [PATCH 05/12] habanalabs: fix race condition in multi CS completion Oded Gabbay
2021-11-28 19:34 ` [PATCH 06/12] habanalabs: add more info ioctls support during reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 07/12] habanalabs: add power information type to POWER_GET packet Oded Gabbay
2021-11-28 19:34 ` [PATCH 08/12] habanalabs: change misleading IRQ warning during reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 09/12] habanalabs: handle events during soft-reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 10/12] habanalabs: skip read fw errors if dynamic descriptor invalid Oded Gabbay
2021-11-28 19:34 ` [PATCH 11/12] habanalabs: add SOB information to signal submission uAPI Oded Gabbay
2021-11-28 19:34 ` [PATCH 12/12] habanalabs: enable access to info ioctl during hard reset Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).