All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] habanalabs/gaudi: don't use disabled ports in collective wait
@ 2021-06-06  8:23 Oded Gabbay
  2021-06-06  8:23 ` [PATCH 2/6] habanalabs/gaudi: add FW alive event support Oded Gabbay
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Oded Gabbay @ 2021-06-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

In the collective wait, we put jobs on the QMANs of all the NICs. The
code takes into account if a port is disabled only in case of PCI card.
When this info arrives from the f/w, the code doesn't take it into
account, and it tries to schedule jobs on NICs that aren't enabled and
thats a bug.

To fix this, after the f/w sends us the list of disabled ports, we
update the state of the QMANs according to that list. In addition,
we need to update the HW_CAP bits so the collective wait operation
will not try to use those QMANs. We also need to update the collective
master monitor mask.

Moreover, we need to add a protection for such future cases and in case
the user will try to submit work to those QMANs.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 219 +++++++++-----------------
 1 file changed, 71 insertions(+), 148 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 007248946b63..68b1187c9c1f 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1000,9 +1000,27 @@ static void gaudi_sob_group_reset_error(struct kref *ref)
 		hw_sob_group->base_sob_id);
 }
 
+static void gaudi_collective_mstr_sob_mask_set(struct gaudi_device *gaudi)
+{
+	struct gaudi_collective_properties *prop;
+	int i;
+
+	prop = &gaudi->collective_props;
+
+	memset(prop->mstr_sob_mask, 0, sizeof(prop->mstr_sob_mask));
+
+	for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++)
+		if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + i))
+			prop->mstr_sob_mask[i / HL_MAX_SOBS_PER_MONITOR] |=
+					BIT(i % HL_MAX_SOBS_PER_MONITOR);
+	/* Set collective engine bit */
+	prop->mstr_sob_mask[i / HL_MAX_SOBS_PER_MONITOR] |=
+				BIT(i % HL_MAX_SOBS_PER_MONITOR);
+}
+
 static int gaudi_collective_init(struct hl_device *hdev)
 {
-	u32 i, master_monitor_sobs, sob_id, reserved_sobs_per_group;
+	u32 i, sob_id, reserved_sobs_per_group;
 	struct gaudi_collective_properties *prop;
 	struct gaudi_device *gaudi;
 
@@ -1028,22 +1046,7 @@ static int gaudi_collective_init(struct hl_device *hdev)
 		gaudi_collective_map_sobs(hdev, i);
 	}
 
-	prop->mstr_sob_mask[0] = 0;
-	master_monitor_sobs = HL_MAX_SOBS_PER_MONITOR;
-	for (i = 0 ; i < master_monitor_sobs ; i++)
-		if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + i))
-			prop->mstr_sob_mask[0] |= BIT(i);
-
-	prop->mstr_sob_mask[1] = 0;
-	master_monitor_sobs =
-		NIC_NUMBER_OF_ENGINES - HL_MAX_SOBS_PER_MONITOR;
-	for (i = 0 ; i < master_monitor_sobs; i++) {
-		if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + i))
-			prop->mstr_sob_mask[1] |= BIT(i);
-	}
-
-	/* Set collective engine bit */
-	prop->mstr_sob_mask[1] |= BIT(i);
+	gaudi_collective_mstr_sob_mask_set(gaudi);
 
 	return 0;
 }
@@ -4272,8 +4275,8 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 			&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
 	u32 db_reg_offset, db_value, dma_qm_offset, q_off, irq_handler_offset;
 	struct gaudi_device *gaudi = hdev->asic_specific;
-	int dma_id;
 	bool invalid_queue = false;
+	int dma_id;
 
 	switch (hw_queue_id) {
 	case GAUDI_QUEUE_ID_DMA_0_0...GAUDI_QUEUE_ID_DMA_0_3:
@@ -4499,164 +4502,84 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 		db_reg_offset = mmTPC7_QM_PQ_PI_3;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_0_0:
-		db_reg_offset = mmNIC0_QM0_PQ_PI_0;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_0_1:
-		db_reg_offset = mmNIC0_QM0_PQ_PI_1;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_0_2:
-		db_reg_offset = mmNIC0_QM0_PQ_PI_2;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_0_3:
-		db_reg_offset = mmNIC0_QM0_PQ_PI_3;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_1_0:
-		db_reg_offset = mmNIC0_QM1_PQ_PI_0;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_1_1:
-		db_reg_offset = mmNIC0_QM1_PQ_PI_1;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_1_2:
-		db_reg_offset = mmNIC0_QM1_PQ_PI_2;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_1_3:
-		db_reg_offset = mmNIC0_QM1_PQ_PI_3;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_2_0:
-		db_reg_offset = mmNIC1_QM0_PQ_PI_0;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_2_1:
-		db_reg_offset = mmNIC1_QM0_PQ_PI_1;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_2_2:
-		db_reg_offset = mmNIC1_QM0_PQ_PI_2;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_2_3:
-		db_reg_offset = mmNIC1_QM0_PQ_PI_3;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_3_0:
-		db_reg_offset = mmNIC1_QM1_PQ_PI_0;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_3_1:
-		db_reg_offset = mmNIC1_QM1_PQ_PI_1;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_3_2:
-		db_reg_offset = mmNIC1_QM1_PQ_PI_2;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_3_3:
-		db_reg_offset = mmNIC1_QM1_PQ_PI_3;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_4_0:
-		db_reg_offset = mmNIC2_QM0_PQ_PI_0;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_4_1:
-		db_reg_offset = mmNIC2_QM0_PQ_PI_1;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_4_2:
-		db_reg_offset = mmNIC2_QM0_PQ_PI_2;
-		break;
+	case GAUDI_QUEUE_ID_NIC_0_0...GAUDI_QUEUE_ID_NIC_0_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC0))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_4_3:
-		db_reg_offset = mmNIC2_QM0_PQ_PI_3;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC0_QM0_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_5_0:
-		db_reg_offset = mmNIC2_QM1_PQ_PI_0;
-		break;
+	case GAUDI_QUEUE_ID_NIC_1_0...GAUDI_QUEUE_ID_NIC_1_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC1))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_5_1:
-		db_reg_offset = mmNIC2_QM1_PQ_PI_1;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC0_QM1_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_5_2:
-		db_reg_offset = mmNIC2_QM1_PQ_PI_2;
-		break;
+	case GAUDI_QUEUE_ID_NIC_2_0...GAUDI_QUEUE_ID_NIC_2_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC2))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_5_3:
-		db_reg_offset = mmNIC2_QM1_PQ_PI_3;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC1_QM0_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_6_0:
-		db_reg_offset = mmNIC3_QM0_PQ_PI_0;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_6_1:
-		db_reg_offset = mmNIC3_QM0_PQ_PI_1;
-		break;
+	case GAUDI_QUEUE_ID_NIC_3_0...GAUDI_QUEUE_ID_NIC_3_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC3))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_6_2:
-		db_reg_offset = mmNIC3_QM0_PQ_PI_2;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC1_QM1_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_6_3:
-		db_reg_offset = mmNIC3_QM0_PQ_PI_3;
-		break;
+	case GAUDI_QUEUE_ID_NIC_4_0...GAUDI_QUEUE_ID_NIC_4_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC4))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_7_0:
-		db_reg_offset = mmNIC3_QM1_PQ_PI_0;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC2_QM0_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_7_1:
-		db_reg_offset = mmNIC3_QM1_PQ_PI_1;
-		break;
+	case GAUDI_QUEUE_ID_NIC_5_0...GAUDI_QUEUE_ID_NIC_5_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC5))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_7_2:
-		db_reg_offset = mmNIC3_QM1_PQ_PI_2;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC2_QM1_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_7_3:
-		db_reg_offset = mmNIC3_QM1_PQ_PI_3;
-		break;
-
-	case GAUDI_QUEUE_ID_NIC_8_0:
-		db_reg_offset = mmNIC4_QM0_PQ_PI_0;
-		break;
+	case GAUDI_QUEUE_ID_NIC_6_0...GAUDI_QUEUE_ID_NIC_6_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC6))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_8_1:
-		db_reg_offset = mmNIC4_QM0_PQ_PI_1;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC3_QM0_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_8_2:
-		db_reg_offset = mmNIC4_QM0_PQ_PI_2;
-		break;
+	case GAUDI_QUEUE_ID_NIC_7_0...GAUDI_QUEUE_ID_NIC_7_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC7))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_8_3:
-		db_reg_offset = mmNIC4_QM0_PQ_PI_3;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC3_QM1_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_9_0:
-		db_reg_offset = mmNIC4_QM1_PQ_PI_0;
-		break;
+	case GAUDI_QUEUE_ID_NIC_8_0...GAUDI_QUEUE_ID_NIC_8_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC8))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_9_1:
-		db_reg_offset = mmNIC4_QM1_PQ_PI_1;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC4_QM0_PQ_PI_0 + q_off;
 		break;
 
-	case GAUDI_QUEUE_ID_NIC_9_2:
-		db_reg_offset = mmNIC4_QM1_PQ_PI_2;
-		break;
+	case GAUDI_QUEUE_ID_NIC_9_0...GAUDI_QUEUE_ID_NIC_9_3:
+		if (!(gaudi->hw_cap_initialized & HW_CAP_NIC9))
+			invalid_queue = true;
 
-	case GAUDI_QUEUE_ID_NIC_9_3:
-		db_reg_offset = mmNIC4_QM1_PQ_PI_3;
+		q_off = ((hw_queue_id - 1) & 0x3) * 4;
+		db_reg_offset = mmNIC4_QM1_PQ_PI_0 + q_off;
 		break;
 
 	default:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-06-06  8:23 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-06  8:23 [PATCH 1/6] habanalabs/gaudi: don't use disabled ports in collective wait Oded Gabbay
2021-06-06  8:23 ` [PATCH 2/6] habanalabs/gaudi: add FW alive event support Oded Gabbay
2021-06-06  8:23 ` [PATCH 3/6] habanalabs: add debug flag to prevent failure on timeout Oded Gabbay
2021-06-06  8:23 ` [PATCH 4/6] habanalabs: reset device upon FD close if not idle Oded Gabbay
2021-06-06  8:23 ` [PATCH 5/6] habanalabs: skip valid test for boot_dev_sts regs Oded Gabbay
2021-06-06  8:23 ` [PATCH 6/6] habanalabs: fix mask to obtain page offset Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.