All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] habanalabs: add reset support when user closes FD
@ 2021-02-22 22:08 Oded Gabbay
  2021-02-22 22:08 ` [PATCH 2/5] habanalabs: reset after device is actually released Oded Gabbay
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-02-22 22:08 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

In order to support command submissions that are done directly from
user space, the driver must perform soft reset once user closes its FD.
In case the soft reset fails or device is not idle, a hard reset should
be performed.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 21 +++++++++++++++++++--
 drivers/misc/habanalabs/common/habanalabs.h |  2 ++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 15fcb5c31c4b..ed1838c15c78 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -93,9 +93,26 @@ void hl_hpriv_put(struct hl_fpriv *hpriv)
 static int hl_device_release(struct inode *inode, struct file *filp)
 {
 	struct hl_fpriv *hpriv = filp->private_data;
+	struct hl_device *hdev = hpriv->hdev;
 
-	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
-	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
+	hl_cb_mgr_fini(hdev, &hpriv->cb_mgr);
+	hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
+
+	if (hdev->reset_upon_device_release) {
+		u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+
+		/* We try soft reset first */
+		hl_device_reset(hdev, false, false);
+
+		/* If device is not idle perform hard reset */
+		if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
+				HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
+			dev_info(hdev->dev,
+				"device is not idle (mask %#llx %#llx) after soft reset, performing hard reset",
+				idle_mask[0], idle_mask[1]);
+			hl_device_reset(hdev, true, false);
+		}
+	}
 
 	filp->private_data = NULL;
 
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index dc9f5a83dfc9..706361a81410 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1920,6 +1920,7 @@ struct hl_mmu_funcs {
  * @device_fini_pending: true if device_fini was called and might be
  *                       waiting for the reset thread to finish
  * @supports_staged_submission: true if staged submissions are supported
+ * @reset_upon_device_release: true if reset is required upon device release
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2026,6 +2027,7 @@ struct hl_device {
 	u8				process_kill_trial_cnt;
 	u8				device_fini_pending;
 	u8				supports_staged_submission;
+	u8				reset_upon_device_release;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/5] habanalabs: reset after device is actually released
  2021-02-22 22:08 [PATCH 1/5] habanalabs: add reset support when user closes FD Oded Gabbay
@ 2021-02-22 22:08 ` Oded Gabbay
  2021-02-22 22:08 ` [PATCH 3/5] habanalabs: fail reset if device is not idle Oded Gabbay
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-02-22 22:08 UTC (permalink / raw)
  To: linux-kernel

The device is actually released only after the refcnt of the hpriv
structure is 0, which means all its contexts were closed.

If we reset the device while a context is still open, there are
possibilities for unexpected behavior and crashes. For example, if the
process has a mapping of a register block that is now currently being
reset, and the process writes/reads to that block during the reset,
the device can get stuck.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 32 ++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index ed1838c15c78..8cc3264ae378 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -70,6 +70,22 @@ static void hpriv_release(struct kref *ref)
 	mutex_unlock(&hdev->fpriv_list_lock);
 
 	kfree(hpriv);
+
+	if (hdev->reset_upon_device_release) {
+		u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+
+		/* We try soft reset first */
+		hl_device_reset(hdev, false, false);
+
+		/* If device is not idle perform hard reset */
+		if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
+				HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
+			dev_info(hdev->dev,
+				"device is not idle (mask %#llx %#llx) after soft reset, performing hard reset",
+				idle_mask[0], idle_mask[1]);
+			hl_device_reset(hdev, true, false);
+		}
+	}
 }
 
 void hl_hpriv_get(struct hl_fpriv *hpriv)
@@ -98,22 +114,6 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 	hl_cb_mgr_fini(hdev, &hpriv->cb_mgr);
 	hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
 
-	if (hdev->reset_upon_device_release) {
-		u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-
-		/* We try soft reset first */
-		hl_device_reset(hdev, false, false);
-
-		/* If device is not idle perform hard reset */
-		if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
-				HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
-			dev_info(hdev->dev,
-				"device is not idle (mask %#llx %#llx) after soft reset, performing hard reset",
-				idle_mask[0], idle_mask[1]);
-			hl_device_reset(hdev, true, false);
-		}
-	}
-
 	filp->private_data = NULL;
 
 	hl_hpriv_put(hpriv);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/5] habanalabs: fail reset if device is not idle
  2021-02-22 22:08 [PATCH 1/5] habanalabs: add reset support when user closes FD Oded Gabbay
  2021-02-22 22:08 ` [PATCH 2/5] habanalabs: reset after device is actually released Oded Gabbay
@ 2021-02-22 22:08 ` Oded Gabbay
  2021-02-22 22:08 ` [PATCH 4/5] habanalabs: reset_upon_device_release is for bring-up Oded Gabbay
  2021-02-22 22:08 ` [PATCH 5/5] habanalabs: print if device is used on FD close Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-02-22 22:08 UTC (permalink / raw)
  To: linux-kernel

After any reset (soft or hard) the device (the engines/QMANs) should
be idle. If they are not idle, fail the reset. If it is soft-reset,
the driver will try to do hard-reset automatically. If it is hard-reset,
the driver will make the device non-operational.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 8cc3264ae378..48d301a03d62 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -71,21 +71,8 @@ static void hpriv_release(struct kref *ref)
 
 	kfree(hpriv);
 
-	if (hdev->reset_upon_device_release) {
-		u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-
-		/* We try soft reset first */
+	if (hdev->reset_upon_device_release)
 		hl_device_reset(hdev, false, false);
-
-		/* If device is not idle perform hard reset */
-		if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
-				HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
-			dev_info(hdev->dev,
-				"device is not idle (mask %#llx %#llx) after soft reset, performing hard reset",
-				idle_mask[0], idle_mask[1]);
-			hl_device_reset(hdev, true, false);
-		}
-	}
 }
 
 void hl_hpriv_get(struct hl_fpriv *hpriv)
@@ -921,6 +908,7 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
 int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 			bool from_hard_reset_thread)
 {
+	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	int i, rc;
 
 	if (!hdev->init_done) {
@@ -1140,6 +1128,15 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		goto out_err;
 	}
 
+	/* If device is not idle fail the reset process */
+	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
+			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
+		dev_err(hdev->dev,
+			"device is not idle (mask %#llx %#llx) after reset\n",
+			idle_mask[0], idle_mask[1]);
+		goto out_err;
+	}
+
 	/* Check that the communication with the device is working */
 	rc = hdev->asic_funcs->test_queues(hdev);
 	if (rc) {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/5] habanalabs: reset_upon_device_release is for bring-up
  2021-02-22 22:08 [PATCH 1/5] habanalabs: add reset support when user closes FD Oded Gabbay
  2021-02-22 22:08 ` [PATCH 2/5] habanalabs: reset after device is actually released Oded Gabbay
  2021-02-22 22:08 ` [PATCH 3/5] habanalabs: fail reset if device is not idle Oded Gabbay
@ 2021-02-22 22:08 ` Oded Gabbay
  2021-02-22 22:08 ` [PATCH 5/5] habanalabs: print if device is used on FD close Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-02-22 22:08 UTC (permalink / raw)
  To: linux-kernel

Move the field to correct location in structure and remove comment.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 706361a81410..9ba48f322964 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1920,7 +1920,6 @@ struct hl_mmu_funcs {
  * @device_fini_pending: true if device_fini was called and might be
  *                       waiting for the reset thread to finish
  * @supports_staged_submission: true if staged submissions are supported
- * @reset_upon_device_release: true if reset is required upon device release
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2027,7 +2026,6 @@ struct hl_device {
 	u8				process_kill_trial_cnt;
 	u8				device_fini_pending;
 	u8				supports_staged_submission;
-	u8				reset_upon_device_release;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
@@ -2045,6 +2043,7 @@ struct hl_device {
 	u8				bmc_enable;
 	u8				rl_enable;
 	u8				reset_on_preboot_fail;
+	u8				reset_upon_device_release;
 };
 
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/5] habanalabs: print if device is used on FD close
  2021-02-22 22:08 [PATCH 1/5] habanalabs: add reset support when user closes FD Oded Gabbay
                   ` (2 preceding siblings ...)
  2021-02-22 22:08 ` [PATCH 4/5] habanalabs: reset_upon_device_release is for bring-up Oded Gabbay
@ 2021-02-22 22:08 ` Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2021-02-22 22:08 UTC (permalink / raw)
  To: linux-kernel

Notify to the user that although he closed the FD, the device is
still in use because there are live CS and/or memory mappings (mmaps).

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 8 +++++---
 drivers/misc/habanalabs/common/habanalabs.h | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 48d301a03d62..6948a1c54083 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -80,9 +80,9 @@ void hl_hpriv_get(struct hl_fpriv *hpriv)
 	kref_get(&hpriv->refcount);
 }
 
-void hl_hpriv_put(struct hl_fpriv *hpriv)
+int hl_hpriv_put(struct hl_fpriv *hpriv)
 {
-	kref_put(&hpriv->refcount, hpriv_release);
+	return kref_put(&hpriv->refcount, hpriv_release);
 }
 
 /*
@@ -103,7 +103,9 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 
 	filp->private_data = NULL;
 
-	hl_hpriv_put(hpriv);
+	if (!hl_hpriv_put(hpriv))
+		dev_warn(hdev->dev,
+			"Device is still in use because there are live CS and/or memory mappings\n");
 
 	return 0;
 }
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 9ba48f322964..046bb44f70f9 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2182,7 +2182,7 @@ int hl_device_resume(struct hl_device *hdev);
 int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 			bool from_hard_reset_thread);
 void hl_hpriv_get(struct hl_fpriv *hpriv);
-void hl_hpriv_put(struct hl_fpriv *hpriv);
+int hl_hpriv_put(struct hl_fpriv *hpriv);
 int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
 uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-02-22 22:10 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-22 22:08 [PATCH 1/5] habanalabs: add reset support when user closes FD Oded Gabbay
2021-02-22 22:08 ` [PATCH 2/5] habanalabs: reset after device is actually released Oded Gabbay
2021-02-22 22:08 ` [PATCH 3/5] habanalabs: fail reset if device is not idle Oded Gabbay
2021-02-22 22:08 ` [PATCH 4/5] habanalabs: reset_upon_device_release is for bring-up Oded Gabbay
2021-02-22 22:08 ` [PATCH 5/5] habanalabs: print if device is used on FD close Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.