* [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure
@ 2021-05-31 15:13 Oded Gabbay
2021-05-31 15:13 ` [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute Oded Gabbay
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:13 UTC (permalink / raw)
To: linux-kernel; +Cc: Koby Elbaz
From: Koby Elbaz <kelbaz@habana.ai>
This fix was applied since there was an incorrect reported CPU ID to GIC
such that an error in MME2 QMAN aliased to be an arriving from DMA0_QM.
Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 5ca4c8f86801..67c44f3fc55f 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -2956,7 +2956,7 @@ static void gaudi_init_mme_qman(struct hl_device *hdev, u32 mme_offset,
/* Configure RAZWI IRQ */
mme_id = mme_offset /
- (mmMME1_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0);
+ (mmMME1_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0) / 2;
mme_qm_err_cfg = MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
if (hdev->stop_on_err) {
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute
2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
@ 2021-05-31 15:13 ` Oded Gabbay
2021-05-31 15:14 ` [PATCH 3/6] habanalabs/gaudi: add ARB to QM stop on error masks Oded Gabbay
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:13 UTC (permalink / raw)
To: linux-kernel
nic_ports_mask is used by the networking part of the driver.
In the compute part, we use the HW_CAP bits to select what is active
and what is not.
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 67c44f3fc55f..1c4da5aff88c 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6712,7 +6712,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
gaudi_mmu_prepare_reg(hdev, mmMME2_ACC_WBC, asid);
gaudi_mmu_prepare_reg(hdev, mmMME3_ACC_WBC, asid);
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC0) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC0) {
gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6725,7 +6725,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC1) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC1) {
gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -6738,7 +6738,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC2) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC2) {
gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6751,7 +6751,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC3) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC3) {
gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -6764,7 +6764,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC4) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC4) {
gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6777,7 +6777,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC5) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC5) {
gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -6790,7 +6790,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC6) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC6) {
gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6803,7 +6803,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC7) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC7) {
gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -6816,7 +6816,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC8) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC8) {
gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_1,
@@ -6829,7 +6829,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
asid);
}
- if (hdev->nic_ports_mask & GAUDI_NIC_MASK_NIC9) {
+ if (gaudi->hw_cap_initialized & HW_CAP_NIC9) {
gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_0,
asid);
gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_1,
@@ -8239,7 +8239,7 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask_arr,
for (i = 0 ; i < (NIC_NUMBER_OF_ENGINES / 2) ; i++) {
offset = i * NIC_MACRO_QMAN_OFFSET;
port = 2 * i;
- if (hdev->nic_ports_mask & BIT(port)) {
+ if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + port)) {
qm_glbl_sts0 = RREG32(mmNIC0_QM0_GLBL_STS0 + offset);
qm_cgm_sts = RREG32(mmNIC0_QM0_CGM_STS + offset);
is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts);
@@ -8254,7 +8254,7 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask_arr,
}
port = 2 * i + 1;
- if (hdev->nic_ports_mask & BIT(port)) {
+ if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + port)) {
qm_glbl_sts0 = RREG32(mmNIC0_QM1_GLBL_STS0 + offset);
qm_cgm_sts = RREG32(mmNIC0_QM1_CGM_STS + offset);
is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts);
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 3/6] habanalabs/gaudi: add ARB to QM stop on error masks
2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
2021-05-31 15:13 ` [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute Oded Gabbay
@ 2021-05-31 15:14 ` Oded Gabbay
2021-05-31 15:14 ` [PATCH 4/6] habanalabs: prefer ASYNC device probing Oded Gabbay
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:14 UTC (permalink / raw)
To: linux-kernel; +Cc: Tomer Tayar
From: Tomer Tayar <ttayar@habana.ai>
Update the QM stop on error masks to also stop on ARB errors.
Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 17 +++++++----------
.../misc/habanalabs/include/gaudi/gaudi_masks.h | 15 ++++++++++-----
2 files changed, 17 insertions(+), 15 deletions(-)
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1c4da5aff88c..3f760b932eee 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -2642,10 +2642,9 @@ static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id,
/* Configure RAZWI IRQ */
dma_qm_err_cfg = PCI_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
- if (hdev->stop_on_err) {
+ if (hdev->stop_on_err)
dma_qm_err_cfg |=
PCI_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
- }
WREG32(mmDMA0_QM_GLBL_ERR_CFG + dma_qm_offset, dma_qm_err_cfg);
@@ -2822,10 +2821,10 @@ static void gaudi_init_hbm_dma_qman(struct hl_device *hdev, int dma_id,
/* Configure RAZWI IRQ */
dma_qm_err_cfg = HBM_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
- if (hdev->stop_on_err) {
+ if (hdev->stop_on_err)
dma_qm_err_cfg |=
HBM_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
- }
+
WREG32(mmDMA0_QM_GLBL_ERR_CFG + dma_qm_offset, dma_qm_err_cfg);
WREG32(mmDMA0_QM_GLBL_ERR_ADDR_LO + dma_qm_offset,
@@ -2959,10 +2958,10 @@ static void gaudi_init_mme_qman(struct hl_device *hdev, u32 mme_offset,
(mmMME1_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0) / 2;
mme_qm_err_cfg = MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
- if (hdev->stop_on_err) {
+ if (hdev->stop_on_err)
mme_qm_err_cfg |=
MME_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
- }
+
WREG32(mmMME0_QM_GLBL_ERR_CFG + mme_offset, mme_qm_err_cfg);
WREG32(mmMME0_QM_GLBL_ERR_ADDR_LO + mme_offset,
@@ -3093,10 +3092,9 @@ static void gaudi_init_tpc_qman(struct hl_device *hdev, u32 tpc_offset,
/* Configure RAZWI IRQ */
tpc_qm_err_cfg = TPC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
- if (hdev->stop_on_err) {
+ if (hdev->stop_on_err)
tpc_qm_err_cfg |=
TPC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
- }
WREG32(mmTPC0_QM_GLBL_ERR_CFG + tpc_offset, tpc_qm_err_cfg);
@@ -3245,10 +3243,9 @@ static void gaudi_init_nic_qman(struct hl_device *hdev, u32 nic_offset,
/* Configure RAZWI IRQ */
nic_qm_err_cfg = NIC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK;
- if (hdev->stop_on_err) {
+ if (hdev->stop_on_err)
nic_qm_err_cfg |=
NIC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK;
- }
WREG32(mmNIC0_QM0_GLBL_ERR_CFG + nic_offset, nic_qm_err_cfg);
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
index b53aeda9a982..9aea7e996654 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
@@ -66,7 +66,8 @@
#define PCI_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0xF)) | \
- (FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)))
+ (FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)) | \
+ (FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
#define HBM_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
@@ -76,7 +77,8 @@
#define HBM_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
- (FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
+ (FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)) | \
+ (FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
#define TPC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
@@ -86,7 +88,8 @@
#define TPC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
- (FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
+ (FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)) | \
+ (FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
#define MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
@@ -96,7 +99,8 @@
#define MME_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
- (FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
+ (FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)) | \
+ (FIELD_PREP(MME0_QM_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
#define NIC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
@@ -106,7 +110,8 @@
#define NIC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0xF)) | \
- (FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)))
+ (FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)) | \
+ (FIELD_PREP(NIC0_QM0_GLBL_ERR_CFG_ARB_STOP_ON_ERR_MASK, 0x1)))
#define QMAN_CGM1_PWR_GATE_EN (FIELD_PREP(DMA0_QM_CGM_CFG1_MASK_TH_MASK, 0xA))
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 4/6] habanalabs: prefer ASYNC device probing
2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
2021-05-31 15:13 ` [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute Oded Gabbay
2021-05-31 15:14 ` [PATCH 3/6] habanalabs/gaudi: add ARB to QM stop on error masks Oded Gabbay
@ 2021-05-31 15:14 ` Oded Gabbay
2021-05-31 15:14 ` [PATCH 5/6] habanalabs/gaudi: split host irq interfaces towards FW Oded Gabbay
2021-05-31 15:14 ` [PATCH 6/6] habanalabs/gaudi: update to latest f/w specs Oded Gabbay
4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:14 UTC (permalink / raw)
To: linux-kernel
There is no dependency when probing multiple devices so indicate to the
kernel that it can probe our devices in ASYNC fashion.
This shortens insmod of the driver from ~2 minutes to 20 seconds on
a system with 8 devices.
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/common/habanalabs_drv.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index bd67d4ceab56..137e7dc63d3b 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -574,7 +574,11 @@ static struct pci_driver hl_pci_driver = {
.probe = hl_pci_probe,
.remove = hl_pci_remove,
.shutdown = hl_pci_remove,
- .driver.pm = &hl_pm_ops,
+ .driver = {
+ .name = HL_NAME,
+ .pm = &hl_pm_ops,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
.err_handler = &hl_pci_err_handler,
};
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 5/6] habanalabs/gaudi: split host irq interfaces towards FW
2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
` (2 preceding siblings ...)
2021-05-31 15:14 ` [PATCH 4/6] habanalabs: prefer ASYNC device probing Oded Gabbay
@ 2021-05-31 15:14 ` Oded Gabbay
2021-05-31 15:14 ` [PATCH 6/6] habanalabs/gaudi: update to latest f/w specs Oded Gabbay
4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:14 UTC (permalink / raw)
To: linux-kernel; +Cc: Ofir Bitton
From: Ofir Bitton <obitton@habana.ai>
Current implementation uses a single interrupt interface towards
FW, this interface is causing races between interrupt types.
We split this interface to interface per interrupt type.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/common/firmware_if.c | 26 +++++++++++++++++--
drivers/misc/habanalabs/gaudi/gaudi.c | 8 +++---
.../habanalabs/include/common/hl_boot_if.h | 14 ++++++++--
.../habanalabs/include/gaudi/gaudi_reg_map.h | 4 +++
4 files changed, 44 insertions(+), 8 deletions(-)
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 4cc6690a3e26..40e91985cb48 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -1782,7 +1782,8 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev,
/* Read boot_cpu status bits */
if (prop->fw_cpu_boot_dev_sts0_valid) {
- prop->fw_bootfit_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg);
+ prop->fw_bootfit_cpu_boot_dev_sts0 =
+ RREG32(cpu_boot_dev_sts0_reg);
if (prop->fw_bootfit_cpu_boot_dev_sts0 &
CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
@@ -1793,7 +1794,8 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev,
}
if (prop->fw_cpu_boot_dev_sts1_valid) {
- prop->fw_bootfit_cpu_boot_dev_sts1 = RREG32(cpu_boot_dev_sts1_reg);
+ prop->fw_bootfit_cpu_boot_dev_sts1 =
+ RREG32(cpu_boot_dev_sts1_reg);
dev_dbg(hdev->dev, "Firmware boot CPU status1 %#x\n",
prop->fw_bootfit_cpu_boot_dev_sts1);
@@ -1803,6 +1805,24 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev,
prop->hard_reset_done_by_fw ? "enabled" : "disabled");
}
+static void hl_fw_dynamic_update_linux_interrupt_if(struct hl_device *hdev)
+{
+ struct cpu_dyn_regs *dyn_regs =
+ &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
+
+ /* Check whether all 3 interrupt interfaces are set, if not use a
+ * single interface
+ */
+ if (!hdev->asic_prop.gic_interrupts_enable &&
+ !(hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
+ CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN)) {
+ dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_irq_ctrl;
+ dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_irq_ctrl;
+
+ dev_warn(hdev->dev,
+ "Using a single interrupt interface towards cpucp");
+ }
+}
/**
* hl_fw_dynamic_load_image - load FW image using dynamic protocol
*
@@ -2150,6 +2170,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hl_fw_linux_update_state(hdev, le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
+ hl_fw_dynamic_update_linux_interrupt_if(hdev);
+
return 0;
protocol_err:
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 3f760b932eee..007248946b63 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3962,7 +3962,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout)
irq_handler_offset = prop->gic_interrupts_enable ?
mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
- le32_to_cpu(dyn_regs->gic_host_irq_ctrl);
+ le32_to_cpu(dyn_regs->gic_host_pi_upd_irq);
WREG32(irq_handler_offset, GAUDI_EVENT_PI_UPDATE);
@@ -4148,7 +4148,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
if (hdev->fw_loader.linux_loaded) {
irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ?
mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
- le32_to_cpu(dyn_regs->gic_host_irq_ctrl);
+ le32_to_cpu(dyn_regs->gic_host_halt_irq);
WREG32(irq_handler_offset, GAUDI_EVENT_HALT_MACHINE);
} else {
@@ -4681,7 +4681,7 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ?
mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
- le32_to_cpu(dyn_regs->gic_host_irq_ctrl);
+ le32_to_cpu(dyn_regs->gic_host_pi_upd_irq);
WREG32(irq_handler_offset, GAUDI_EVENT_PI_UPDATE);
}
@@ -8912,7 +8912,7 @@ static void gaudi_enable_events_from_fw(struct hl_device *hdev)
&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
u32 irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ?
mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
- le32_to_cpu(dyn_regs->gic_host_irq_ctrl);
+ le32_to_cpu(dyn_regs->gic_host_ints_irq);
WREG32(irq_handler_offset, GAUDI_EVENT_INTS_REGISTER);
}
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 6d0c1ddb4304..89ac8020f821 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -205,6 +205,10 @@
* was not served before.
* Initialized in: linux
*
+ * CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN Use multiple scratchpad interfaces to
+ * prevent IRQs overriding each other.
+ * Initialized in: linux
+ *
* CPU_BOOT_DEV_STS0_ENABLED Device status register enabled.
* This is a main indication that the
* running FW populates the device status
@@ -235,6 +239,7 @@
#define CPU_BOOT_DEV_STS0_DYN_PLL_EN (1 << 19)
#define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN (1 << 20)
#define CPU_BOOT_DEV_STS0_EQ_INDEX_EN (1 << 21)
+#define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN (1 << 22)
#define CPU_BOOT_DEV_STS0_ENABLED (1 << 31)
#define CPU_BOOT_DEV_STS1_ENABLED (1 << 31)
@@ -308,13 +313,18 @@ struct cpu_dyn_regs {
__le32 hw_state;
__le32 kmd_msg_to_cpu;
__le32 cpu_cmd_status_to_host;
- __le32 gic_host_irq_ctrl;
+ union {
+ __le32 gic_host_irq_ctrl;
+ __le32 gic_host_pi_upd_irq;
+ };
__le32 gic_tpc_qm_irq_ctrl;
__le32 gic_mme_qm_irq_ctrl;
__le32 gic_dma_qm_irq_ctrl;
__le32 gic_nic_qm_irq_ctrl;
__le32 gic_dma_core_irq_ctrl;
- __le32 reserved1[26]; /* reserve for future use */
+ __le32 gic_host_halt_irq;
+ __le32 gic_host_ints_irq;
+ __le32 reserved1[24]; /* reserve for future use */
};
/* TODO: remove the desc magic after the code is updated to use message */
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h b/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
index cd69d3407631..d95d4162ae2c 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
@@ -12,12 +12,16 @@
* PSOC scratch-pad registers
*/
#define mmHW_STATE mmPSOC_GLOBAL_CONF_SCRATCHPAD_0
+/* TODO: remove mmGIC_HOST_IRQ_CTRL_POLL_REG */
#define mmGIC_HOST_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
+#define mmGIC_HOST_PI_UPD_IRQ_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
#define mmGIC_TPC_QM_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_2
#define mmGIC_MME_QM_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_3
#define mmGIC_DMA_QM_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_4
#define mmGIC_NIC_QM_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_5
#define mmGIC_DMA_CR_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_6
+#define mmGIC_HOST_HALT_IRQ_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_7
+#define mmGIC_HOST_INTS_IRQ_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_8
#define mmCPU_BOOT_DEV_STS0 mmPSOC_GLOBAL_CONF_SCRATCHPAD_20
#define mmCPU_BOOT_DEV_STS1 mmPSOC_GLOBAL_CONF_SCRATCHPAD_21
#define mmFUSE_VER_OFFSET mmPSOC_GLOBAL_CONF_SCRATCHPAD_22
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 6/6] habanalabs/gaudi: update to latest f/w specs
2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
` (3 preceding siblings ...)
2021-05-31 15:14 ` [PATCH 5/6] habanalabs/gaudi: split host irq interfaces towards FW Oded Gabbay
@ 2021-05-31 15:14 ` Oded Gabbay
4 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-05-31 15:14 UTC (permalink / raw)
To: linux-kernel
Update the firmware interface files to their latest version.
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
.../habanalabs/include/common/hl_boot_if.h | 36 +++++++++++++------
.../habanalabs/include/gaudi/gaudi_fw_if.h | 7 ++++
2 files changed, 33 insertions(+), 10 deletions(-)
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 89ac8020f821..fa8a5ad2d438 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -333,24 +333,41 @@ struct cpu_dyn_regs {
#define HL_COMMS_DESC_VER 1
/* HCMv - Habana Communications Message + header version */
-#define HL_COMMS_MSG_MAGIC_VER(ver) (0x48434D00 | ((ver) & 0xff))
+#define HL_COMMS_MSG_MAGIC_VALUE 0x48434D00
+#define HL_COMMS_MSG_MAGIC_MASK 0xFFFFFF00
+#define HL_COMMS_MSG_MAGIC_VER_MASK 0xFF
+
+#define HL_COMMS_MSG_MAGIC_VER(ver) (HL_COMMS_MSG_MAGIC_VALUE | \
+ ((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
#define HL_COMMS_MSG_MAGIC_V0 HL_COMMS_DESC_MAGIC
#define HL_COMMS_MSG_MAGIC_V1 HL_COMMS_MSG_MAGIC_VER(1)
#define HL_COMMS_MSG_MAGIC HL_COMMS_MSG_MAGIC_V1
+#define HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC(magic) \
+ (((magic) & HL_COMMS_MSG_MAGIC_MASK) == \
+ HL_COMMS_MSG_MAGIC_VALUE)
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE_VERSION(magic, ver) \
+ (((magic) & HL_COMMS_MSG_MAGIC_VER_MASK) >= \
+ ((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE(magic, ver) \
+ (HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC((magic)) && \
+ HL_COMMS_MSG_MAGIC_VALIDATE_VERSION((magic), (ver)))
+
enum comms_msg_type {
HL_COMMS_DESC_TYPE = 0,
HL_COMMS_RESET_CAUSE_TYPE = 1,
};
-/* TODO: remove this struct after the code is updated to use comms_msg_header */
+/* TODO: remove this struct after the code is updated to use message */
/* this is the comms descriptor header - meta data */
struct comms_desc_header {
__le32 magic; /* magic for validation */
__le32 crc32; /* CRC32 of the descriptor w/o header */
__le16 size; /* size of the descriptor w/o header */
- __u8 version; /* descriptor version */
+ __u8 version; /* descriptor version */
__u8 reserved[5]; /* pad to 64 bit */
};
@@ -359,7 +376,7 @@ struct comms_msg_header {
__le32 magic; /* magic for validation */
__le32 crc32; /* CRC32 of the message w/o header */
__le16 size; /* size of the message w/o header */
- __u8 version; /* message payload version */
+ __u8 version; /* message payload version */
__u8 type; /* message type */
__u8 reserved[4]; /* pad to 64 bit */
};
@@ -372,8 +389,7 @@ struct lkd_fw_comms_desc {
char cur_fw_ver[VERSION_MAX_LEN];
/* can be used for 1 more version w/o ABI change */
char reserved0[VERSION_MAX_LEN];
- /* address for next FW component load */
- __le64 img_addr;
+ __le64 img_addr; /* address for next FW component load */
};
enum comms_reset_cause {
@@ -382,10 +398,11 @@ enum comms_reset_cause {
HL_RESET_CAUSE_TDR = 2,
};
-#define RESET_CAUSE_PADDING 7
+/* TODO: remove define after struct name is aligned on all projects */
+#define lkd_msg_comms lkd_fw_comms_msg
/* this is the comms message descriptor */
-struct lkd_msg_comms {
+struct lkd_fw_comms_msg {
struct comms_msg_header header;
/* union for future expantions of new messages */
union {
@@ -400,7 +417,6 @@ struct lkd_msg_comms {
};
struct {
__u8 reset_cause;
- __u8 reserved[RESET_CAUSE_PADDING]; /* 64 bit pad */
};
};
};
@@ -474,7 +490,7 @@ enum comms_cmd {
struct comms_command {
union { /* bit fields are only for FW use */
struct {
- u32 size :25; /* 32MB max. */
+ u32 size :25; /* 32MB max. */
u32 reserved :2;
enum comms_cmd cmd :5; /* 32 commands */
};
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h b/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
index a4afb984d0ae..34ca4fe50d91 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_fw_if.h
@@ -20,6 +20,9 @@
#define UBOOT_FW_OFFSET 0x100000 /* 1MB in SRAM */
#define LINUX_FW_OFFSET 0x800000 /* 8MB in HBM */
+/* HBM thermal delta in [Deg] added to composite (CTemp) */
+#define HBM_TEMP_ADJUST_COEFF 6
+
enum gaudi_nic_axi_error {
RXB,
RXE,
@@ -56,6 +59,8 @@ struct eq_nic_sei_event {
* @pcs_link: has PCS link.
* @phy_ready: is PHY ready.
* @auto_neg: is Autoneg enabled.
+ * @timeout_retransmission_cnt: timeout retransmission events
+ * @high_ber_cnt: high ber events
*/
struct gaudi_nic_status {
__u32 port;
@@ -69,6 +74,8 @@ struct gaudi_nic_status {
__u8 pcs_link;
__u8 phy_ready;
__u8 auto_neg;
+ __u32 timeout_retransmission_cnt;
+ __u32 high_ber_cnt;
};
struct gaudi_flops_2_data {
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
end of thread, other threads:[~2021-05-31 16:59 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-31 15:13 [PATCH 1/6] habanalabs/gaudi: set the correct cpu_id on MME2_QM failure Oded Gabbay
2021-05-31 15:13 ` [PATCH 2/6] habanalabs/gaudi: don't use nic_ports_mask in compute Oded Gabbay
2021-05-31 15:14 ` [PATCH 3/6] habanalabs/gaudi: add ARB to QM stop on error masks Oded Gabbay
2021-05-31 15:14 ` [PATCH 4/6] habanalabs: prefer ASYNC device probing Oded Gabbay
2021-05-31 15:14 ` [PATCH 5/6] habanalabs/gaudi: split host irq interfaces towards FW Oded Gabbay
2021-05-31 15:14 ` [PATCH 6/6] habanalabs/gaudi: update to latest f/w specs Oded Gabbay
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).