linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Oded Gabbay <oded.gabbay@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: gregkh@linuxfoundation.org
Subject: [PATCH 2/3] habanalabs: prevent host crash during suspend/resume
Date: Sat, 16 Mar 2019 22:43:16 +0200	[thread overview]
Message-ID: <20190316204317.28279-2-oded.gabbay@gmail.com> (raw)
In-Reply-To: <20190316204317.28279-1-oded.gabbay@gmail.com>

This patch fixes the implementation of suspend/resume of the device so that
upon resume of the device, the host won't crash due to PCI completion
timeout.

Upon suspend, the device is being reset due to PERST. Therefore, upon
resume, the driver must initialize the PCI controller as if the driver was
loaded. If the controller is not initialized and the device tries to
access the device through the PCI bars, the host will crash with PCI
completion timeout error.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/device.c    | 46 +++++++++++++++++++--
 drivers/misc/habanalabs/goya/goya.c | 63 +----------------------------
 2 files changed, 43 insertions(+), 66 deletions(-)

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 470d8005b50e..77d51be66c7e 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -416,6 +416,27 @@ int hl_device_suspend(struct hl_device *hdev)
 
 	pci_save_state(hdev->pdev);
 
+	/* Block future CS/VM/JOB completion operations */
+	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
+	if (rc) {
+		dev_err(hdev->dev, "Can't suspend while in reset\n");
+		return -EIO;
+	}
+
+	/* This blocks all other stuff that is not blocked by in_reset */
+	hdev->disabled = true;
+
+	/*
+	 * Flush anyone that is inside the critical section of enqueue
+	 * jobs to the H/W
+	 */
+	hdev->asic_funcs->hw_queues_lock(hdev);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	/* Flush processes that are sending message to CPU */
+	mutex_lock(&hdev->send_cpu_message_lock);
+	mutex_unlock(&hdev->send_cpu_message_lock);
+
 	rc = hdev->asic_funcs->suspend(hdev);
 	if (rc)
 		dev_err(hdev->dev,
@@ -443,21 +464,38 @@ int hl_device_resume(struct hl_device *hdev)
 
 	pci_set_power_state(hdev->pdev, PCI_D0);
 	pci_restore_state(hdev->pdev);
-	rc = pci_enable_device(hdev->pdev);
+	rc = pci_enable_device_mem(hdev->pdev);
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to enable PCI device in resume\n");
 		return rc;
 	}
 
+	pci_set_master(hdev->pdev);
+
 	rc = hdev->asic_funcs->resume(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"Failed to enable PCI access from device CPU\n");
-		return rc;
+		dev_err(hdev->dev, "Failed to resume device after suspend\n");
+		goto disable_device;
+	}
+
+
+	hdev->disabled = false;
+	atomic_set(&hdev->in_reset, 0);
+
+	rc = hl_device_reset(hdev, true, false);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to reset device during resume\n");
+		goto disable_device;
 	}
 
 	return 0;
+
+disable_device:
+	pci_clear_master(hdev->pdev);
+	pci_disable_device(hdev->pdev);
+
+	return rc;
 }
 
 static void hl_device_hard_reset_pending(struct work_struct *work)
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 238dd57c541b..538d8d59d9dc 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -1201,15 +1201,6 @@ static int goya_stop_external_queues(struct hl_device *hdev)
 	return retval;
 }
 
-static void goya_resume_external_queues(struct hl_device *hdev)
-{
-	WREG32(mmDMA_QM_0_GLBL_CFG1, 0);
-	WREG32(mmDMA_QM_1_GLBL_CFG1, 0);
-	WREG32(mmDMA_QM_2_GLBL_CFG1, 0);
-	WREG32(mmDMA_QM_3_GLBL_CFG1, 0);
-	WREG32(mmDMA_QM_4_GLBL_CFG1, 0);
-}
-
 /*
  * goya_init_cpu_queues - Initialize PQ/CQ/EQ of CPU
  *
@@ -2178,36 +2169,6 @@ static int goya_stop_internal_queues(struct hl_device *hdev)
 	return retval;
 }
 
-static void goya_resume_internal_queues(struct hl_device *hdev)
-{
-	WREG32(mmMME_QM_GLBL_CFG1, 0);
-	WREG32(mmMME_CMDQ_GLBL_CFG1, 0);
-
-	WREG32(mmTPC0_QM_GLBL_CFG1, 0);
-	WREG32(mmTPC0_CMDQ_GLBL_CFG1, 0);
-
-	WREG32(mmTPC1_QM_GLBL_CFG1, 0);
-	WREG32(mmTPC1_CMDQ_GLBL_CFG1, 0);
-
-	WREG32(mmTPC2_QM_GLBL_CFG1, 0);
-	WREG32(mmTPC2_CMDQ_GLBL_CFG1, 0);
-
-	WREG32(mmTPC3_QM_GLBL_CFG1, 0);
-	WREG32(mmTPC3_CMDQ_GLBL_CFG1, 0);
-
-	WREG32(mmTPC4_QM_GLBL_CFG1, 0);
-	WREG32(mmTPC4_CMDQ_GLBL_CFG1, 0);
-
-	WREG32(mmTPC5_QM_GLBL_CFG1, 0);
-	WREG32(mmTPC5_CMDQ_GLBL_CFG1, 0);
-
-	WREG32(mmTPC6_QM_GLBL_CFG1, 0);
-	WREG32(mmTPC6_CMDQ_GLBL_CFG1, 0);
-
-	WREG32(mmTPC7_QM_GLBL_CFG1, 0);
-	WREG32(mmTPC7_CMDQ_GLBL_CFG1, 0);
-}
-
 static void goya_dma_stall(struct hl_device *hdev)
 {
 	WREG32(mmDMA_QM_0_GLBL_CFG1, 1 << DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT);
@@ -2905,20 +2866,6 @@ int goya_suspend(struct hl_device *hdev)
 {
 	int rc;
 
-	rc = goya_stop_internal_queues(hdev);
-
-	if (rc) {
-		dev_err(hdev->dev, "failed to stop internal queues\n");
-		return rc;
-	}
-
-	rc = goya_stop_external_queues(hdev);
-
-	if (rc) {
-		dev_err(hdev->dev, "failed to stop external queues\n");
-		return rc;
-	}
-
 	rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
 	if (rc)
 		dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
@@ -2928,15 +2875,7 @@ int goya_suspend(struct hl_device *hdev)
 
 int goya_resume(struct hl_device *hdev)
 {
-	int rc;
-
-	goya_resume_external_queues(hdev);
-	goya_resume_internal_queues(hdev);
-
-	rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
-	if (rc)
-		dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
-	return rc;
+	return goya_init_iatu(hdev);
 }
 
 static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
-- 
2.17.1


  reply	other threads:[~2019-03-16 20:43 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-03-16 20:43 [PATCH 1/3] habanalabs: perform accounting for active CS Oded Gabbay
2019-03-16 20:43 ` Oded Gabbay [this message]
2019-03-16 20:43 ` [PATCH 3/3] habanalabs: cast to expected type Oded Gabbay

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190316204317.28279-2-oded.gabbay@gmail.com \
    --to=oded.gabbay@gmail.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).