linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Oded Gabbay <ogabbay@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Ofir Bitton <obitton@habana.ai>
Subject: [PATCH 1/4] habanalabs: improve communication protocol with cpucp
Date: Tue,  2 Feb 2021 18:55:57 +0200	[thread overview]
Message-ID: <20210202165600.4620-1-ogabbay@kernel.org> (raw)

From: Ofir Bitton <obitton@habana.ai>

Current messaging communictaion protocol with cpucp can get out
of sync due to coherency issues. In order to improve the protocol
reliability, we modify the protocol to expect a different
acknowledgment for every packet sent to cpucp.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c    | 17 +++++++++++++++--
 drivers/misc/habanalabs/common/habanalabs.h     |  3 +++
 drivers/misc/habanalabs/gaudi/gaudi.c           |  6 +++++-
 drivers/misc/habanalabs/goya/goya.c             |  6 +++++-
 .../misc/habanalabs/include/common/hl_boot_if.h |  5 +++++
 5 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index ba6920f2b4ab..31b52a223f02 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -90,9 +90,10 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
 int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 				u16 len, u32 timeout, u64 *result)
 {
+	struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
 	struct cpucp_packet *pkt;
 	dma_addr_t pkt_dma_addr;
-	u32 tmp;
+	u32 tmp, expected_ack_val;
 	int rc = 0;
 
 	pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
@@ -115,14 +116,22 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 		goto out;
 	}
 
+	/* set fence to a non valid value */
+	pkt->fence = UINT_MAX;
+
 	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
 		goto out;
 	}
 
+	if (hdev->asic_prop.fw_cpucp_ack_with_pi)
+		expected_ack_val = queue->pi;
+	else
+		expected_ack_val = CPUCP_PACKET_FENCE_VAL;
+
 	rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
-				(tmp == CPUCP_PACKET_FENCE_VAL), 1000,
+				(tmp == expected_ack_val), 1000,
 				timeout, true);
 
 	hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
@@ -777,6 +786,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
 				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
 			prop->hard_reset_done_by_fw = true;
 
+		if (prop->fw_boot_cpu_security_map &
+				CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
+			prop->fw_cpucp_ack_with_pi = true;
+
 		dev_dbg(hdev->dev,
 			"Firmware boot CPU security status %#x\n",
 			prop->fw_boot_cpu_security_map);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 30f32f2edb8a..3c54010f7ab9 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -419,6 +419,8 @@ struct hl_mmu_properties {
  *                            from BOOT_DEV_STS0
  * @dram_supports_virtual_memory: is there an MMU towards the DRAM
  * @hard_reset_done_by_fw: true if firmware is handling hard reset flow
+ * @fw_cpucp_ack_with_pi: true if cpucp is acking messages with the PQ PI
+ *                        instead of a magic number
  * @num_functional_hbms: number of functional HBMs in each DCORE.
  */
 struct asic_fixed_properties {
@@ -479,6 +481,7 @@ struct asic_fixed_properties {
 	u8				fw_security_status_valid;
 	u8				dram_supports_virtual_memory;
 	u8				hard_reset_done_by_fw;
+	u8				fw_cpucp_ack_with_pi;
 	u8				num_functional_hbms;
 };
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 52fcaf25531a..006c34ae35c2 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -533,6 +533,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
 	prop->fw_security_disabled = true;
 	prop->fw_security_status_valid = false;
 	prop->hard_reset_done_by_fw = false;
+	prop->fw_cpucp_ack_with_pi = false;
 
 	return 0;
 }
@@ -4438,9 +4439,12 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 	/* ring the doorbell */
 	WREG32(db_reg_offset, db_value);
 
-	if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ)
+	if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ) {
+		/* make sure device CPU will read latest data from host */
+		mb();
 		WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
 				GAUDI_EVENT_PI_UPDATE);
+	}
 }
 
 static void gaudi_pqe_write(struct hl_device *hdev, __le64 *pqe,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index a954e7c02375..53db7e966866 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -461,6 +461,7 @@ int goya_get_fixed_properties(struct hl_device *hdev)
 	prop->fw_security_disabled = true;
 	prop->fw_security_status_valid = false;
 	prop->hard_reset_done_by_fw = false;
+	prop->fw_cpucp_ack_with_pi = false;
 
 	return 0;
 }
@@ -2806,9 +2807,12 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 	/* ring the doorbell */
 	WREG32(db_reg_offset, db_value);
 
-	if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ)
+	if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ) {
+		/* make sure device CPU will read latest data from host */
+		mb();
 		WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
 				GOYA_ASYNC_EVENT_ID_PI_UPDATE);
+	}
 }
 
 void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd)
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 57785478a4ef..e87f5a98e193 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -166,6 +166,10 @@
  *					FW handles HBM ECC indications.
  *					Initialized in: linux
  *
+ * CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN	Packets ack value used in the armcpd
+ *					is set to the PI counter.
+ *					Initialized in: linux
+ *
  * CPU_BOOT_DEV_STS0_ENABLED		Device status register enabled.
  *					This is a main indication that the
  *					running FW populates the device status
@@ -190,6 +194,7 @@
 #define CPU_BOOT_DEV_STS0_SP_SRAM_EN			(1 << 12)
 #define CPU_BOOT_DEV_STS0_CLK_GATE_EN			(1 << 13)
 #define CPU_BOOT_DEV_STS0_HBM_ECC_EN			(1 << 14)
+#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN			(1 << 15)
 #define CPU_BOOT_DEV_STS0_ENABLED			(1 << 31)
 
 enum cpu_boot_status {
-- 
2.25.1


             reply	other threads:[~2021-02-02 17:11 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-02-02 16:55 Oded Gabbay [this message]
2021-02-02 16:55 ` [PATCH 2/4] habanalabs: support fetching first available user CQ Oded Gabbay
2021-02-02 16:55 ` [PATCH 3/4] habanalabs/gaudi: use HBM_ECC_EN bit for ECC ERR Oded Gabbay
2021-02-02 16:56 ` [PATCH 4/4] habanalabs: enable F/W events after init done Oded Gabbay

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210202165600.4620-1-ogabbay@kernel.org \
    --to=ogabbay@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=obitton@habana.ai \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).