linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Oded Gabbay <ogabbay@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Yuri Nudelman <ynudelman@habana.ai>
Subject: [PATCH 3/6] habanalabs: add debug flag to prevent failure on timeout
Date: Sun,  6 Jun 2021 11:23:17 +0300	[thread overview]
Message-ID: <20210606082320.4319-3-ogabbay@kernel.org> (raw)
In-Reply-To: <20210606082320.4319-1-ogabbay@kernel.org>

From: Yuri Nudelman <ynudelman@habana.ai>

Sometimes it is useful to allow the command to continue running despite
the timeout occurred, to differentiate between really stuck or just very
time consuming commands. This can be achieved by passing a new debug
flag alongside the cs, HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT.

Anyway, if the timeout occurred, a warning print shall be issued,
however this shall not fail the submission.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 25 +++++++++++++++----
 drivers/misc/habanalabs/common/habanalabs.h   |  5 ++++
 include/uapi/misc/habanalabs.h                |  1 +
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index ecd96fbe3150..516d190164e4 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -556,6 +556,13 @@ static void cs_do_release(struct kref *ref)
 	else if (!cs->submitted)
 		cs->fence->error = -EBUSY;
 
+	if (unlikely(cs->skip_reset_on_timeout)) {
+		dev_err(hdev->dev,
+			"Command submission %llu completed after %llu (s)\n",
+			cs->sequence,
+			(jiffies - cs->submission_time_jiffies) / HZ);
+	}
+
 	if (cs->timestamp)
 		cs->fence->timestamp = ktime_get();
 	complete_all(&cs->fence->completion);
@@ -571,6 +578,8 @@ static void cs_timedout(struct work_struct *work)
 	int rc;
 	struct hl_cs *cs = container_of(work, struct hl_cs,
 						 work_tdr.work);
+	bool skip_reset_on_timeout = cs->skip_reset_on_timeout;
+
 	rc = cs_get_unless_zero(cs);
 	if (!rc)
 		return;
@@ -581,7 +590,8 @@ static void cs_timedout(struct work_struct *work)
 	}
 
 	/* Mark the CS is timed out so we won't try to cancel its TDR */
-	cs->timedout = true;
+	if (likely(!skip_reset_on_timeout))
+		cs->timedout = true;
 
 	hdev = cs->ctx->hdev;
 
@@ -613,10 +623,12 @@ static void cs_timedout(struct work_struct *work)
 
 	cs_put(cs);
 
-	if (hdev->reset_on_lockup)
-		hl_device_reset(hdev, HL_RESET_TDR);
-	else
-		hdev->needs_reset = true;
+	if (likely(!skip_reset_on_timeout)) {
+		if (hdev->reset_on_lockup)
+			hl_device_reset(hdev, HL_RESET_TDR);
+		else
+			hdev->needs_reset = true;
+	}
 }
 
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
@@ -650,6 +662,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	cs->type = cs_type;
 	cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
 	cs->timeout_jiffies = timeout;
+	cs->skip_reset_on_timeout =
+		!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
+	cs->submission_time_jiffies = jiffies;
 	INIT_LIST_HEAD(&cs->job_list);
 	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
 	kref_init(&cs->refcount);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index e751868b3ed3..56d2f41f8893 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1421,6 +1421,7 @@ struct hl_userptr {
  * @staged_sequence: the sequence of the staged submission this CS is part of,
  *                   relevant only if staged_cs is set.
  * @timeout_jiffies: cs timeout in jiffies.
+ * @submission_time_jiffies: submission time of the cs
  * @type: CS_TYPE_*.
  * @submitted: true if CS was submitted to H/W.
  * @completed: true if CS was completed by device.
@@ -1433,6 +1434,8 @@ struct hl_userptr {
  * @staged_first: true if this is the first staged CS and we need to receive
  *                timeout for this CS.
  * @staged_cs: true if this CS is part of a staged submission.
+ * @skip_reset_on_timeout: true if we shall not reset the device in case
+ *                         timeout occurs (debug scenario).
  */
 struct hl_cs {
 	u16			*jobs_in_queue_cnt;
@@ -1450,6 +1453,7 @@ struct hl_cs {
 	u64			sequence;
 	u64			staged_sequence;
 	u64			timeout_jiffies;
+	u64			submission_time_jiffies;
 	enum hl_cs_type		type;
 	u8			submitted;
 	u8			completed;
@@ -1460,6 +1464,7 @@ struct hl_cs {
 	u8			staged_last;
 	u8			staged_first;
 	u8			staged_cs;
+	u8			skip_reset_on_timeout;
 };
 
 /**
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 6d2d34c9f375..a47485a8d411 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -664,6 +664,7 @@ struct hl_cs_chunk {
 #define HL_CS_FLAGS_STAGED_SUBMISSION_FIRST	0x80
 #define HL_CS_FLAGS_STAGED_SUBMISSION_LAST	0x100
 #define HL_CS_FLAGS_CUSTOM_TIMEOUT		0x200
+#define HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT	0x400
 
 #define HL_CS_STATUS_SUCCESS		0
 
-- 
2.25.1


  parent reply	other threads:[~2021-06-06  8:23 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-06  8:23 [PATCH 1/6] habanalabs/gaudi: don't use disabled ports in collective wait Oded Gabbay
2021-06-06  8:23 ` [PATCH 2/6] habanalabs/gaudi: add FW alive event support Oded Gabbay
2021-06-06  8:23 ` Oded Gabbay [this message]
2021-06-06  8:23 ` [PATCH 4/6] habanalabs: reset device upon FD close if not idle Oded Gabbay
2021-06-06  8:23 ` [PATCH 5/6] habanalabs: skip valid test for boot_dev_sts regs Oded Gabbay
2021-06-06  8:23 ` [PATCH 6/6] habanalabs: fix mask to obtain page offset Oded Gabbay

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210606082320.4319-3-ogabbay@kernel.org \
    --to=ogabbay@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ynudelman@habana.ai \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).