linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure
@ 2021-05-31 15:13 Oded Gabbay
  2021-05-31 15:13 ` [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute Oded Gabbay
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

This fix is needed because an incorrect CPU ID was reported to the GIC,
so that an error in the MME2 QMAN was aliased as arriving from DMA0_QM.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 5ca4c8f86801..67c44f3fc55f 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -2956,7 +2956,7 @@ static void gaudi_init_mme_qman(struct hl_device *hdev, u32 mme_offset,
 
 		/* Configure RAZWI IRQ */
 		mme_id = mme_offset /
-				(mmMME1_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0);
+				(mmMME1_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0) / 2;
 
 		mme_qm_err_cfg = MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
 		if (hdev->stop_on_err) {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute
  2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
@ 2021-05-31 15:13 ` Oded Gabbay
  2021-05-31 15:14 ` [PATCH 3/6] habanalabs/gaudi: add ARB to QM stop on error masks Oded Gabbay
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:13 UTC (permalink / raw)
  To: linux-kernel

nic_ports_mask is used by the networking part of the driver.
In the compute part, we use the HW_CAP bits to select what is active
and what is not.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 67c44f3fc55f..1c4da5aff88c 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6712,7 +6712,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 	gaudi_mmu_prepare_reg(hdev, mmMME2_ACC_WBC, asid);
 	gaudi_mmu_prepare_reg(hdev, mmMME3_ACC_WBC, asid);
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC0) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC0) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6725,7 +6725,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC1) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC1) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -6738,7 +6738,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC2) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC2) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6751,7 +6751,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC3) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC3) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -6764,7 +6764,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC4) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC4) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6777,7 +6777,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC5) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC5) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -6790,7 +6790,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC6) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC6) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6803,7 +6803,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC7) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC7) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -6816,7 +6816,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC8) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC8) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6829,7 +6829,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
-	if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC9) {
+	if (gaudi->hw_cap_initialized & HW_CAP_NIC9) {
 		gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_0,
 				asid);
 		gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -8239,7 +8239,7 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask_arr,
 	for (i = 0 ; i < (NIC_NUMBER_OF_ENGINES / 2) ; i++) {
 		offset = i * NIC_MACRO_QMAN_OFFSET;
 		port = 2 * i;
-		if (hdev->nic_ports_mask & BIT(port)) {
+		if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + port)) {
 			qm_glbl_sts0 = RREG32(mmNIC0_QM0_GLBL_STS0 + offset);
 			qm_cgm_sts = RREG32(mmNIC0_QM0_CGM_STS + offset);
 			is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts);
@@ -8254,7 +8254,7 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask_arr,
 		}
 
 		port = 2 * i + 1;
-		if (hdev->nic_ports_mask & BIT(port)) {
+		if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + port)) {
 			qm_glbl_sts0 = RREG32(mmNIC0_QM1_GLBL_STS0 + offset);
 			qm_cgm_sts = RREG32(mmNIC0_QM1_CGM_STS + offset);
 			is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/6] habanalabs/gaudi: add ARB to QM stop on error masks
  2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
  2021-05-31 15:13 ` [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute Oded Gabbay
@ 2021-05-31 15:14 ` Oded Gabbay
  2021-05-31 15:14 ` [PATCH 4/6] habanalabs: prefer ASYNC device probing Oded Gabbay
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:14 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Update the QM stop on error masks to also stop on ARB errors.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c           | 17 +++++++----------
 .../misc/habanalabs/include/gaudi/gaudi_masks.h | 15 ++++++++++-----
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1c4da5aff88c..3f760b932eee 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -2642,10 +2642,9 @@ static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id,
 
 		/* Configure RAZWI IRQ */
 		dma_qm_err_cfg = PCI_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
-		if (hdev->stop_on_err) {
+		if (hdev->stop_on_err)
 			dma_qm_err_cfg |=
 				PCI_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
-		}
 
 		WREG32(mmDMA0_QM_GLBL_ERR_CFG + dma_qm_offset, dma_qm_err_cfg);
 
@@ -2822,10 +2821,10 @@ static void gaudi_init_hbm_dma_qman(struct hl_device *hdev, int dma_id,
 
 		/* Configure RAZWI IRQ */
 		dma_qm_err_cfg = HBM_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
-		if (hdev->stop_on_err) {
+		if (hdev->stop_on_err)
 			dma_qm_err_cfg |=
 				HBM_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
-		}
+
 		WREG32(mmDMA0_QM_GLBL_ERR_CFG + dma_qm_offset, dma_qm_err_cfg);
 
 		WREG32(mmDMA0_QM_GLBL_ERR_ADDR_LO + dma_qm_offset,
@@ -2959,10 +2958,10 @@ static void gaudi_init_mme_qman(struct hl_device *hdev, u32 mme_offset,
 				(mmMME1_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0) / 2;
 
 		mme_qm_err_cfg = MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
-		if (hdev->stop_on_err) {
+		if (hdev->stop_on_err)
 			mme_qm_err_cfg |=
 				MME_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
-		}
+
 		WREG32(mmMME0_QM_GLBL_ERR_CFG + mme_offset, mme_qm_err_cfg);
 
 		WREG32(mmMME0_QM_GLBL_ERR_ADDR_LO + mme_offset,
@@ -3093,10 +3092,9 @@ static void gaudi_init_tpc_qman(struct hl_device *hdev, u32 tpc_offset,
 
 		/* Configure RAZWI IRQ */
 		tpc_qm_err_cfg = TPC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
-		if (hdev->stop_on_err) {
+		if (hdev->stop_on_err)
 			tpc_qm_err_cfg |=
 				TPC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
-		}
 
 		WREG32(mmTPC0_QM_GLBL_ERR_CFG + tpc_offset, tpc_qm_err_cfg);
 
@@ -3245,10 +3243,9 @@ static void gaudi_init_nic_qman(struct hl_device *hdev, u32 nic_offset,
 
 		/* Configure RAZWI IRQ */
 		nic_qm_err_cfg = NIC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
-		if (hdev->stop_on_err) {
+		if (hdev->stop_on_err)
 			nic_qm_err_cfg |=
 				NIC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
-		}
 
 		WREG32(mmNIC0_QM0_GLBL_ERR_CFG + nic_offset, nic_qm_err_cfg);
 
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
index b53aeda9a982..9aea7e996654 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
@@ -66,7 +66,8 @@
 #define PCI_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK	(\
 	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
 	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0xF)) | \
-	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)))
+	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)) | \
+	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
 
 #define HBM_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK	(\
 	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
@@ -76,7 +77,8 @@
 #define HBM_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK	(\
 	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
 	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
-	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
+	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)) | \
+	(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
 
 #define TPC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK	(\
 	(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
@@ -86,7 +88,8 @@
 #define TPC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK	(\
 	(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
 	(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
-	(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
+	(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)) | \
+	(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
 
 #define MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK	(\
 	(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
@@ -96,7 +99,8 @@
 #define MME_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK	(\
 	(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
 	(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
-	(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
+	(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)) | \
+	(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
 
 #define NIC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK	(\
 	(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
@@ -106,7 +110,8 @@
 #define NIC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK	(\
 	(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
 	(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0xF)) | \
-	(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)))
+	(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)) | \
+	(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
 
 #define QMAN_CGM1_PWR_GATE_EN	(FIELD_PREP(DMA0_QM_CGM_CFG1_MASK_TH_MASK, 0xA))
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/6] habanalabs: prefer ASYNC device probing
  2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
  2021-05-31 15:13 ` [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute Oded Gabbay
  2021-05-31 15:14 ` [PATCH 3/6] habanalabs/gaudi: add ARB to QM stop on error masks Oded Gabbay
@ 2021-05-31 15:14 ` Oded Gabbay
  2021-05-31 15:14 ` [PATCH 5/6] habanalabs/gaudi: split host irq interfaces towards FW Oded Gabbay
  2021-05-31 15:14 ` [PATCH 6/6] habanalabs/gaudi: update to latest f/w specs Oded Gabbay
  4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:14 UTC (permalink / raw)
  To: linux-kernel

There is no dependency when probing multiple devices so indicate to the
kernel that it can probe our devices in ASYNC fashion.

This shortens insmod of the driver from ~2 minutes to 20 seconds on
a system with 8 devices.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs_drv.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index bd67d4ceab56..137e7dc63d3b 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -574,7 +574,11 @@ static struct pci_driver hl_pci_driver = {
 	.probe = hl_pci_probe,
 	.remove = hl_pci_remove,
 	.shutdown = hl_pci_remove,
-	.driver.pm = &hl_pm_ops,
+	.driver = {
+		.name = HL_NAME,
+		.pm = &hl_pm_ops,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+	},
 	.err_handler = &hl_pci_err_handler,
 };
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/6] habanalabs/gaudi: split host irq interfaces towards FW
  2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
                   ` (2 preceding siblings ...)
  2021-05-31 15:14 ` [PATCH 4/6] habanalabs: prefer ASYNC device probing Oded Gabbay
@ 2021-05-31 15:14 ` Oded Gabbay
  2021-05-31 15:14 ` [PATCH 6/6] habanalabs/gaudi: update to latest f/w specs Oded Gabbay
  4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:14 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

The current implementation uses a single interrupt interface towards the
FW, and this single interface causes races between interrupt types.
We split it into a separate interface per interrupt type.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c  | 26 +++++++++++++++++--
 drivers/misc/habanalabs/gaudi/gaudi.c         |  8 +++---
 .../habanalabs/include/common/hl_boot_if.h    | 14 ++++++++--
 .../habanalabs/include/gaudi/gaudi_reg_map.h  |  4 +++
 4 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 4cc6690a3e26..40e91985cb48 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -1782,7 +1782,8 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev,
 
 	/* Read boot_cpu status bits */
 	if (prop->fw_cpu_boot_dev_sts0_valid) {
-		prop->fw_bootfit_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg);
+		prop->fw_bootfit_cpu_boot_dev_sts0 =
+				RREG32(cpu_boot_dev_sts0_reg);
 
 		if (prop->fw_bootfit_cpu_boot_dev_sts0 &
 				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
@@ -1793,7 +1794,8 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev,
 	}
 
 	if (prop->fw_cpu_boot_dev_sts1_valid) {
-		prop->fw_bootfit_cpu_boot_dev_sts1 = RREG32(cpu_boot_dev_sts1_reg);
+		prop->fw_bootfit_cpu_boot_dev_sts1 =
+				RREG32(cpu_boot_dev_sts1_reg);
 
 		dev_dbg(hdev->dev, "Firmware boot CPU status1 %#x\n",
 					prop->fw_bootfit_cpu_boot_dev_sts1);
@@ -1803,6 +1805,24 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev,
 			prop->hard_reset_done_by_fw ? "enabled" : "disabled");
 }
 
+static void hl_fw_dynamic_update_linux_interrupt_if(struct hl_device *hdev)
+{
+	struct cpu_dyn_regs *dyn_regs =
+			&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
+
+	/* Check whether all 3 interrupt interfaces are set, if not use a
+	 * single interface
+	 */
+	if (!hdev->asic_prop.gic_interrupts_enable &&
+			!(hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
+				CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN)) {
+		dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_irq_ctrl;
+		dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_irq_ctrl;
+
+		dev_warn(hdev->dev,
+			"Using a single interrupt interface towards cpucp");
+	}
+}
 /**
  * hl_fw_dynamic_load_image - load FW image using dynamic protocol
  *
@@ -2150,6 +2170,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	hl_fw_linux_update_state(hdev, le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
 				le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
 
+	hl_fw_dynamic_update_linux_interrupt_if(hdev);
+
 	return 0;
 
 protocol_err:
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 3f760b932eee..007248946b63 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3962,7 +3962,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout)
 
 	irq_handler_offset = prop->gic_interrupts_enable ?
 			mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
-			le32_to_cpu(dyn_regs->gic_host_irq_ctrl);
+			le32_to_cpu(dyn_regs->gic_host_pi_upd_irq);
 
 	WREG32(irq_handler_offset, GAUDI_EVENT_PI_UPDATE);
 
@@ -4148,7 +4148,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 	if (hdev->fw_loader.linux_loaded) {
 		irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ?
 				mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
-				le32_to_cpu(dyn_regs->gic_host_irq_ctrl);
+				le32_to_cpu(dyn_regs->gic_host_halt_irq);
 
 		WREG32(irq_handler_offset, GAUDI_EVENT_HALT_MACHINE);
 	} else {
@@ -4681,7 +4681,7 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 
 		irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ?
 				mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
-				le32_to_cpu(dyn_regs->gic_host_irq_ctrl);
+				le32_to_cpu(dyn_regs->gic_host_pi_upd_irq);
 
 		WREG32(irq_handler_offset, GAUDI_EVENT_PI_UPDATE);
 	}
@@ -8912,7 +8912,7 @@ static void gaudi_enable_events_from_fw(struct hl_device *hdev)
 			&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
 	u32 irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ?
 			mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
-			le32_to_cpu(dyn_regs->gic_host_irq_ctrl);
+			le32_to_cpu(dyn_regs->gic_host_ints_irq);
 
 	WREG32(irq_handler_offset, GAUDI_EVENT_INTS_REGISTER);
 }
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 6d0c1ddb4304..89ac8020f821 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -205,6 +205,10 @@
  *					was not served before.
  *					Initialized in: linux
  *
+ * CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN  Use multiple scratchpad interfaces to
+ *					prevent IRQs overriding each other.
+ *					Initialized in: linux
+ *
  * CPU_BOOT_DEV_STS0_ENABLED		Device status register enabled.
  *					This is a main indication that the
  *					running FW populates the device status
@@ -235,6 +239,7 @@
 #define CPU_BOOT_DEV_STS0_DYN_PLL_EN			(1 << 19)
 #define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN		(1 << 20)
 #define CPU_BOOT_DEV_STS0_EQ_INDEX_EN			(1 << 21)
+#define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN		(1 << 22)
 #define CPU_BOOT_DEV_STS0_ENABLED			(1 << 31)
 #define CPU_BOOT_DEV_STS1_ENABLED			(1 << 31)
 
@@ -308,13 +313,18 @@ struct cpu_dyn_regs {
 	__le32 hw_state;
 	__le32 kmd_msg_to_cpu;
 	__le32 cpu_cmd_status_to_host;
-	__le32 gic_host_irq_ctrl;
+	union {
+		__le32 gic_host_irq_ctrl;
+		__le32 gic_host_pi_upd_irq;
+	};
 	__le32 gic_tpc_qm_irq_ctrl;
 	__le32 gic_mme_qm_irq_ctrl;
 	__le32 gic_dma_qm_irq_ctrl;
 	__le32 gic_nic_qm_irq_ctrl;
 	__le32 gic_dma_core_irq_ctrl;
-	__le32 reserved1[26];		/* reserve for future use */
+	__le32 gic_host_halt_irq;
+	__le32 gic_host_ints_irq;
+	__le32 reserved1[24];		/* reserve for future use */
 };
 
 /* TODO: remove the desc magic after the code is updated to use message */
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h b/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
index cd69d3407631..d95d4162ae2c 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
@@ -12,12 +12,16 @@
  * PSOC scratch-pad registers
  */
 #define mmHW_STATE			mmPSOC_GLOBAL_CONF_SCRATCHPAD_0
+/* TODO: remove mmGIC_HOST_IRQ_CTRL_POLL_REG */
 #define mmGIC_HOST_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
+#define mmGIC_HOST_PI_UPD_IRQ_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
 #define mmGIC_TPC_QM_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_2
 #define mmGIC_MME_QM_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_3
 #define mmGIC_DMA_QM_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_4
 #define mmGIC_NIC_QM_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_5
 #define mmGIC_DMA_CR_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_6
+#define mmGIC_HOST_HALT_IRQ_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_7
+#define mmGIC_HOST_INTS_IRQ_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_8
 #define mmCPU_BOOT_DEV_STS0		mmPSOC_GLOBAL_CONF_SCRATCHPAD_20
 #define mmCPU_BOOT_DEV_STS1		mmPSOC_GLOBAL_CONF_SCRATCHPAD_21
 #define mmFUSE_VER_OFFSET		mmPSOC_GLOBAL_CONF_SCRATCHPAD_22
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 6/6] habanalabs/gaudi: update to latest f/w specs
  2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
                   ` (3 preceding siblings ...)
  2021-05-31 15:14 ` [PATCH 5/6] habanalabs/gaudi: split host irq interfaces towards FW Oded Gabbay
@ 2021-05-31 15:14 ` Oded Gabbay
  4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:14 UTC (permalink / raw)
  To: linux-kernel

Update the firmware interface files to their latest version.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/include/common/hl_boot_if.h    | 36 +++++++++++++------
 .../habanalabs/include/gaudi/gaudi_fw_if.h    |  7 ++++
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 89ac8020f821..fa8a5ad2d438 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -333,24 +333,41 @@ struct cpu_dyn_regs {
 #define HL_COMMS_DESC_VER	1
 
 /* HCMv - Habana Communications Message + header version */
-#define HL_COMMS_MSG_MAGIC_VER(ver)	(0x48434D00 | ((ver) & 0xff))
+#define HL_COMMS_MSG_MAGIC_VALUE	0x48434D00
+#define HL_COMMS_MSG_MAGIC_MASK		0xFFFFFF00
+#define HL_COMMS_MSG_MAGIC_VER_MASK	0xFF
+
+#define HL_COMMS_MSG_MAGIC_VER(ver)	(HL_COMMS_MSG_MAGIC_VALUE |	\
+					((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
 #define HL_COMMS_MSG_MAGIC_V0		HL_COMMS_DESC_MAGIC
 #define HL_COMMS_MSG_MAGIC_V1		HL_COMMS_MSG_MAGIC_VER(1)
 
 #define HL_COMMS_MSG_MAGIC		HL_COMMS_MSG_MAGIC_V1
 
+#define HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC(magic)			\
+		(((magic) & HL_COMMS_MSG_MAGIC_MASK) ==			\
+		HL_COMMS_MSG_MAGIC_VALUE)
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE_VERSION(magic, ver)			\
+		(((magic) & HL_COMMS_MSG_MAGIC_VER_MASK) >=		\
+		((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE(magic, ver)				\
+		(HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC((magic)) &&		\
+		HL_COMMS_MSG_MAGIC_VALIDATE_VERSION((magic), (ver)))
+
 enum comms_msg_type {
 	HL_COMMS_DESC_TYPE = 0,
 	HL_COMMS_RESET_CAUSE_TYPE = 1,
 };
 
-/* TODO: remove this struct after the code is updated to use comms_msg_header */
+/* TODO: remove this struct after the code is updated to use message */
 /* this is the comms descriptor header - meta data */
 struct comms_desc_header {
 	__le32 magic;		/* magic for validation */
 	__le32 crc32;		/* CRC32 of the descriptor w/o header */
 	__le16 size;		/* size of the descriptor w/o header */
-	__u8 version;		/* descriptor version */
+	__u8 version;	/* descriptor version */
 	__u8 reserved[5];	/* pad to 64 bit */
 };
 
@@ -359,7 +376,7 @@ struct comms_msg_header {
 	__le32 magic;		/* magic for validation */
 	__le32 crc32;		/* CRC32 of the message w/o header */
 	__le16 size;		/* size of the message w/o header */
-	__u8 version;		/* message payload version */
+	__u8 version;	/* message payload version */
 	__u8 type;		/* message type */
 	__u8 reserved[4];	/* pad to 64 bit */
 };
@@ -372,8 +389,7 @@ struct lkd_fw_comms_desc {
 	char cur_fw_ver[VERSION_MAX_LEN];
 	/* can be used for 1 more version w/o ABI change */
 	char reserved0[VERSION_MAX_LEN];
-	/* address for next FW component load */
-	__le64 img_addr;
+	__le64 img_addr;	/* address for next FW component load */
 };
 
 enum comms_reset_cause {
@@ -382,10 +398,11 @@ enum comms_reset_cause {
 	HL_RESET_CAUSE_TDR = 2,
 };
 
-#define RESET_CAUSE_PADDING	7
+/* TODO: remove define after struct name is aligned on all projects */
+#define lkd_msg_comms lkd_fw_comms_msg
 
 /* this is the comms message descriptor */
-struct lkd_msg_comms {
+struct lkd_fw_comms_msg {
 	struct comms_msg_header header;
 	/* union for future expantions of new messages */
 	union {
@@ -400,7 +417,6 @@ struct lkd_msg_comms {
 		};
 		struct {
 			__u8 reset_cause;
-			__u8 reserved[RESET_CAUSE_PADDING]; /* 64 bit pad */
 		};
 	};
 };
@@ -474,7 +490,7 @@ enum comms_cmd {
 struct comms_command {
 	union {		/* bit fields are only for FW use */
 		struct {
-			u32 size :25;			/* 32MB max. */
+			u32 size :25;		/* 32MB max. */
 			u32 reserved :2;
 			enum comms_cmd cmd :5;		/* 32 commands */
 		};
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h b/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
index a4afb984d0ae..34ca4fe50d91 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
@@ -20,6 +20,9 @@
 #define UBOOT_FW_OFFSET			0x100000	/* 1MB in SRAM */
 #define LINUX_FW_OFFSET			0x800000	/* 8MB in HBM */
 
+/* HBM thermal delta in [Deg] added to composite (CTemp) */
+#define HBM_TEMP_ADJUST_COEFF		6
+
 enum gaudi_nic_axi_error {
 	RXB,
 	RXE,
@@ -56,6 +59,8 @@ struct eq_nic_sei_event {
  * @pcs_link: has PCS link.
  * @phy_ready: is PHY ready.
  * @auto_neg: is Autoneg enabled.
+ * @timeout_retransmission_cnt: timeout retransmission events
+ * @high_ber_cnt: high ber events
  */
 struct gaudi_nic_status {
 	__u32 port;
@@ -69,6 +74,8 @@ struct gaudi_nic_status {
 	__u8 pcs_link;
 	__u8 phy_ready;
 	__u8 auto_neg;
+	__u32 timeout_retransmission_cnt;
+	__u32 high_ber_cnt;
 };
 
 struct gaudi_flops_2_data {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-05-31 16:59 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
2021-05-31 15:13 ` [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute Oded Gabbay
2021-05-31 15:14 ` [PATCH 3/6] habanalabs/gaudi: add ARB to QM stop on error masks Oded Gabbay
2021-05-31 15:14 ` [PATCH 4/6] habanalabs: prefer ASYNC device probing Oded Gabbay
2021-05-31 15:14 ` [PATCH 5/6] habanalabs/gaudi: split host irq interfaces towards FW Oded Gabbay
2021-05-31 15:14 ` [PATCH 6/6] habanalabs/gaudi: update to latest f/w specs Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).