All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support
@ 2022-11-17 16:19 Oded Gabbay
  2022-11-17 16:19 ` [PATCH 02/20] habanalabs/gaudi: add razwi notify event Oded Gabbay
                   ` (18 more replies)
  0 siblings, 19 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Add support for Gaudi2 devices with PCI revision 2.
Functionality is exactly the same as revision 1; the only difference
is the device name exposed to the user.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c       |  4 +++
 drivers/misc/habanalabs/common/habanalabs.h   |  2 ++
 .../misc/habanalabs/common/habanalabs_drv.c   | 26 +++++++++++++------
 .../misc/habanalabs/common/habanalabs_ioctl.c |  6 +++--
 drivers/misc/habanalabs/common/mmu/mmu.c      |  1 +
 drivers/misc/habanalabs/common/sysfs.c        |  2 ++
 drivers/misc/habanalabs/gaudi2/gaudi2.c       |  6 +----
 drivers/misc/habanalabs/gaudi2/gaudi2P.h      |  2 --
 .../include/hw_ip/pci/pci_general.h           |  7 +++++
 include/uapi/misc/habanalabs.h                |  7 +++++
 10 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 3ea1ee1ec8ef..35ed494fcfdf 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -748,6 +748,10 @@ static int device_early_init(struct hl_device *hdev)
 		gaudi2_set_asic_funcs(hdev);
 		strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name));
 		break;
+	case ASIC_GAUDI2B:
+		gaudi2_set_asic_funcs(hdev);
+		strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
+		break;
 		break;
 	default:
 		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 7d191f388953..e391e7951fb7 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1192,6 +1192,7 @@ struct hl_dec {
  * @ASIC_GAUDI: Gaudi device (HL-2000).
  * @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000).
  * @ASIC_GAUDI2: Gaudi2 device.
+ * @ASIC_GAUDI2B: Gaudi2B device.
  */
 enum hl_asic_type {
 	ASIC_INVALID,
@@ -1199,6 +1200,7 @@ enum hl_asic_type {
 	ASIC_GAUDI,
 	ASIC_GAUDI_SEC,
 	ASIC_GAUDI2,
+	ASIC_GAUDI2B,
 };
 
 struct hl_cs_parser;
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index e82af8989700..7815c60df54e 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -9,6 +9,7 @@
 #define pr_fmt(fmt)		"habanalabs: " fmt
 
 #include "habanalabs.h"
+#include "../include/hw_ip/pci/pci_general.h"
 
 #include <linux/pci.h>
 #include <linux/aer.h>
@@ -74,16 +75,17 @@ MODULE_DEVICE_TABLE(pci, ids);
 /*
  * get_asic_type - translate device id to asic type
  *
- * @device: id of the PCI device
+ * @hdev: pointer to habanalabs device structure.
  *
- * Translate device id to asic type.
+ * Translate device id and revision id to asic type.
  * In case of unidentified device, return -1
  */
-static enum hl_asic_type get_asic_type(u16 device)
+static enum hl_asic_type get_asic_type(struct hl_device *hdev)
 {
-	enum hl_asic_type asic_type;
+	struct pci_dev *pdev = hdev->pdev;
+	enum hl_asic_type asic_type = ASIC_INVALID;
 
-	switch (device) {
+	switch (pdev->device) {
 	case PCI_IDS_GOYA:
 		asic_type = ASIC_GOYA;
 		break;
@@ -94,10 +96,18 @@ static enum hl_asic_type get_asic_type(u16 device)
 		asic_type = ASIC_GAUDI_SEC;
 		break;
 	case PCI_IDS_GAUDI2:
-		asic_type = ASIC_GAUDI2;
+		switch (pdev->revision) {
+		case REV_ID_A:
+			asic_type = ASIC_GAUDI2;
+			break;
+		case REV_ID_B:
+			asic_type = ASIC_GAUDI2B;
+			break;
+		default:
+			break;
+		}
 		break;
 	default:
-		asic_type = ASIC_INVALID;
 		break;
 	}
 
@@ -416,7 +426,7 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
 	/* First, we must find out which ASIC are we handling. This is needed
 	 * to configure the behavior of the driver (kernel parameters)
 	 */
-	hdev->asic_type = get_asic_type(pdev->device);
+	hdev->asic_type = get_asic_type(hdev);
 	if (hdev->asic_type == ASIC_INVALID) {
 		dev_err(&pdev->dev, "Unsupported ASIC\n");
 		rc = -ENODEV;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 5ce5c42e2731..ee43017eb563 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -10,10 +10,11 @@
 #include <uapi/misc/habanalabs.h>
 #include "habanalabs.h"
 
-#include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 
 static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = {
@@ -105,6 +106,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	hw_ip.edma_enabled_mask = prop->edma_enabled_mask;
 	hw_ip.server_type = prop->server_type;
 	hw_ip.security_enabled = prop->fw_security_enabled;
+	hw_ip.revision_id = hdev->pdev->revision;
 
 	return copy_to_user(out, &hw_ip,
 		min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0;
diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c
index 67d3e70cf571..2c1005f74cf4 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu.c
@@ -635,6 +635,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev)
 		hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]);
 		break;
 	case ASIC_GAUDI2:
+	case ASIC_GAUDI2B:
 		/* MMUs in Gaudi2 are always host resident */
 		hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
 		break;
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index c924fc994bd9..735d8bed0066 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -248,6 +248,8 @@ static ssize_t device_type_show(struct device *dev,
 	case ASIC_GAUDI2:
 		str = "GAUDI2";
 		break;
+	case ASIC_GAUDI2B:
+		str = "GAUDI2B";
 		break;
 	default:
 		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 03f8cf9bb136..f21b68be6d20 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -3968,11 +3968,7 @@ static void gaudi2_init_firmware_loader(struct hl_device *hdev)
 	fw_loader->skip_bmc = false;
 	fw_loader->sram_bar_id = SRAM_CFG_BAR_ID;
 	fw_loader->dram_bar_id = DRAM_BAR_ID;
-
-	if (hdev->asic_type == ASIC_GAUDI2)
-		fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC;
-	else /* ASIC_GAUDI2_FPGA */
-		fw_loader->cpu_timeout = GAUDI2_FPGA_CPU_TIMEOUT;
+	fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC;
 
 	/* here we update initial values for few specific dynamic regs (as
 	 * before reading the first descriptor from FW those value has to be
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2P.h b/drivers/misc/habanalabs/gaudi2/gaudi2P.h
index a99c348bbf39..b4383c199bbb 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2P.h
@@ -23,8 +23,6 @@
 
 #define GAUDI2_CPU_TIMEOUT_USEC		30000000	/* 30s */
 
-#define GAUDI2_FPGA_CPU_TIMEOUT		100000000	/* 100s */
-
 #define NUMBER_OF_PDMA_QUEUES		2
 #define NUMBER_OF_EDMA_QUEUES		8
 #define NUMBER_OF_MME_QUEUES		4
diff --git a/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h b/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h
index d232081d4e0f..f5d497dc9bdc 100644
--- a/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h
+++ b/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h
@@ -20,4 +20,11 @@
 #define PCI_CONFIG_ELBI_STS_MASK	(PCI_CONFIG_ELBI_STS_ERR | \
 					PCI_CONFIG_ELBI_STS_DONE)
 
+enum hl_revision_id {
+	/* PCI revision ID 0 is not legal */
+	REV_ID_INVALID				= 0x00,
+	REV_ID_A				= 0x01,
+	REV_ID_B				= 0x02,
+};
+
 #endif /* INCLUDE_PCI_GENERAL_H_ */
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index a4ceee681898..58343998bd63 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -868,6 +868,7 @@ enum hl_server_type {
  * @number_of_user_interrupts: The number of interrupts that are available to the userspace
  *                             application to use. Relevant for Gaudi2 and later.
  * @device_mem_alloc_default_page_size: default page size used in device memory allocation.
+ * @revision_id: PCI revision ID of the ASIC.
  */
 struct hl_info_hw_ip_info {
 	__u64 sram_base_address;
@@ -898,6 +899,12 @@ struct hl_info_hw_ip_info {
 	__u16 pad2;
 	__u64 reserved4;
 	__u64 device_mem_alloc_default_page_size;
+	__u64 reserved5;
+	__u64 reserved6;
+	__u32 reserved7;
+	__u8 reserved8;
+	__u8 revision_id;
+	__u8 pad[2];
 };
 
 struct hl_info_dram_usage {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 02/20] habanalabs/gaudi: add razwi notify event
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 03/20] habanalabs: use single threaded WQ for event handling Oded Gabbay
                   ` (17 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Each time a razwi (read-only zero, write ignored) event happens, besides
capturing its data, also notify the user about it.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     |  8 +++++
 drivers/misc/habanalabs/common/habanalabs.h |  2 ++
 drivers/misc/habanalabs/gaudi/gaudi.c       | 37 +++++++++++----------
 include/uapi/misc/habanalabs.h              |  2 ++
 4 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 35ed494fcfdf..d1a609589558 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -2409,6 +2409,14 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
 			num_of_engines * sizeof(u16));
 	hdev->captured_err_info.razwi.flags = flags;
 }
+
+void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
+			u8 flags, u64 *event_mask)
+{
+	hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags);
+	*event_mask |= HL_NOTIFIER_EVENT_RAZWI;
+}
+
 static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
 {
 	struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index e391e7951fb7..d9335f3769b8 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -3812,6 +3812,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg,
 __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...);
 void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
 			u8 flags);
+void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
+			u8 flags, u64 *event_mask);
 void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 3dfb9ecf7db3..035865cb097c 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7301,7 +7301,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
 }
 
 static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
-					bool razwi)
+					bool razwi, u64 *event_mask)
 {
 	bool is_read = false, is_write = false;
 	u16 engine_id[2], num_of_razwi_eng = 0;
@@ -7337,7 +7337,8 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
 				num_of_razwi_eng = 1;
 		}
 
-		hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags);
+		hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags,
+				event_mask);
 	}
 }
 
@@ -7675,7 +7676,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
 	case GAUDI_EVENT_MMU_DERR:
 	case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
-		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_print_irq_info(hdev, event_type, true, &event_mask);
 		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
@@ -7685,7 +7686,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_AXI_ECC:
 	case GAUDI_EVENT_L2_RAM_ECC:
 	case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
-		gaudi_print_irq_info(hdev, event_type, false);
+		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		goto reset_device;
@@ -7694,7 +7695,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_HBM1_SPI_0:
 	case GAUDI_EVENT_HBM2_SPI_0:
 	case GAUDI_EVENT_HBM3_SPI_0:
-		gaudi_print_irq_info(hdev, event_type, false);
+		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		gaudi_hbm_read_interrupts(hdev,
 				gaudi_hbm_event_to_dev(event_type),
 				&eq_entry->hbm_ecc_data);
@@ -7706,7 +7707,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_HBM1_SPI_1:
 	case GAUDI_EVENT_HBM2_SPI_1:
 	case GAUDI_EVENT_HBM3_SPI_1:
-		gaudi_print_irq_info(hdev, event_type, false);
+		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		gaudi_hbm_read_interrupts(hdev,
 				gaudi_hbm_event_to_dev(event_type),
 				&eq_entry->hbm_ecc_data);
@@ -7728,7 +7729,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 		 * if the event is a TPC Assertion or a "real" TPC DEC.
 		 */
 		event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT;
-		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_print_irq_info(hdev, event_type, true, &event_mask);
 		reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_dec_event_to_tpc_id(event_type),
 					"AXI_SLV_DEC_Error");
@@ -7753,7 +7754,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_TPC5_KRN_ERR:
 	case GAUDI_EVENT_TPC6_KRN_ERR:
 	case GAUDI_EVENT_TPC7_KRN_ERR:
-		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_print_irq_info(hdev, event_type, true, &event_mask);
 		reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_krn_event_to_tpc_id(event_type),
 					"KRN_ERR");
@@ -7792,7 +7793,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR:
 		fallthrough;
 	case GAUDI_EVENT_MMU_SERR:
-		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_print_irq_info(hdev, event_type, true, &event_mask);
 		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
 		hl_fw_unmask_irq(hdev, event_type);
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
@@ -7802,14 +7803,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_CPU_AXI_SPLITTER:
 	case GAUDI_EVENT_PSOC_AXI_DEC:
 	case GAUDI_EVENT_PSOC_PRSTN_FALL:
-		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_print_irq_info(hdev, event_type, true, &event_mask);
 		hl_fw_unmask_irq(hdev, event_type);
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
 
 	case GAUDI_EVENT_MMU_PAGE_FAULT:
 	case GAUDI_EVENT_MMU_WR_PERM:
-		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_print_irq_info(hdev, event_type, true, &event_mask);
 		hl_fw_unmask_irq(hdev, event_type);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
@@ -7838,14 +7839,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_NIC4_QM1:
 	case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE:
 	case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
-		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_print_irq_info(hdev, event_type, true, &event_mask);
 		gaudi_handle_qman_err(hdev, event_type, &event_mask);
 		hl_fw_unmask_irq(hdev, event_type);
 		event_mask |= (HL_NOTIFIER_EVENT_USER_ENGINE_ERR | HL_NOTIFIER_EVENT_DEVICE_RESET);
 		break;
 
 	case GAUDI_EVENT_RAZWI_OR_ADC_SW:
-		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_print_irq_info(hdev, event_type, true, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		goto reset_device;
 
@@ -7858,7 +7859,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_TPC6_BMON_SPMU:
 	case GAUDI_EVENT_TPC7_BMON_SPMU:
 	case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7:
-		gaudi_print_irq_info(hdev, event_type, false);
+		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		hl_fw_unmask_irq(hdev, event_type);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
@@ -7870,7 +7871,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 		break;
 
 	case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3:
-		gaudi_print_irq_info(hdev, event_type, false);
+		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		gaudi_print_sm_sei_info(hdev, event_type,
 					&eq_entry->sm_sei_data);
 		rc = hl_state_dump(hdev);
@@ -7899,18 +7900,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 		break;
 
 	case GAUDI_EVENT_DEV_RESET_REQ:
-		gaudi_print_irq_info(hdev, event_type, false);
+		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
-		gaudi_print_irq_info(hdev, event_type, false);
+		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_FW_ALIVE_S:
-		gaudi_print_irq_info(hdev, event_type, false);
+		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		goto reset_device;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 58343998bd63..7747e19e81fe 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -721,6 +721,7 @@ enum hl_server_type {
  * HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE	- Indicates device is unavailable
  * HL_NOTIFIER_EVENT_USER_ENGINE_ERR	- Indicates device engine in error state
  * HL_NOTIFIER_EVENT_GENERAL_HW_ERR     - Indicates device HW error
+ * HL_NOTIFIER_EVENT_RAZWI              - Indicates razwi happened
  */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
@@ -729,6 +730,7 @@ enum hl_server_type {
 #define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE	(1ULL << 4)
 #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR	(1ULL << 5)
 #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR	(1ULL << 6)
+#define HL_NOTIFIER_EVENT_RAZWI			(1ULL << 7)
 
 /* Opcode for management ioctl
  *
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 03/20] habanalabs: use single threaded WQ for event handling
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
  2022-11-17 16:19 ` [PATCH 02/20] habanalabs/gaudi: add razwi notify event Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 04/20] habanalabs/gaudi: add page fault notify event Oded Gabbay
                   ` (16 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Creating the event queue workqueue with alloc_workqueue() made it run in
multi-threaded mode. As a result, events were dumped in parallel and
users were notified of events in parallel, so logs containing multiple
events could appear out of order.

Fix this by creating the event queue workqueue as a single-threaded
workqueue.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index d1a609589558..65bb40f81901 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -787,7 +787,7 @@ static int device_early_init(struct hl_device *hdev)
 		}
 	}
 
-	hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
+	hdev->eq_wq = create_singlethread_workqueue("hl-events");
 	if (hdev->eq_wq == NULL) {
 		dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
 		rc = -ENOMEM;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 04/20] habanalabs/gaudi: add page fault notify event
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
  2022-11-17 16:19 ` [PATCH 02/20] habanalabs/gaudi: add razwi notify event Oded Gabbay
  2022-11-17 16:19 ` [PATCH 03/20] habanalabs: use single threaded WQ for event handling Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 05/20] habanalabs/gaudi2: implement fp32 not supported event Oded Gabbay
                   ` (15 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Each time a page fault happens, besides capturing its data, also notify
the user about it.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 9 +++++++++
 drivers/misc/habanalabs/common/habanalabs.h | 2 ++
 drivers/misc/habanalabs/gaudi/gaudi.c       | 6 +++---
 include/uapi/misc/habanalabs.h              | 2 ++
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 65bb40f81901..31818121ef4d 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -2490,3 +2490,12 @@ void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is
 	hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id;
 	hl_capture_user_mappings(hdev, is_pmmu);
 }
+
+void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
+				u64 *event_mask)
+{
+	hl_capture_page_fault(hdev, addr, eng_id, is_pmmu);
+
+	if (event_mask)
+		*event_mask |=  HL_NOTIFIER_EVENT_PAGE_FAULT;
+}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index d9335f3769b8..0781b8698f74 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -3815,6 +3815,8 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
 void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
 			u8 flags, u64 *event_mask);
 void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);
+void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
+				u64 *event_mask);
 
 #ifdef CONFIG_DEBUG_FS
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 035865cb097c..cbe1daf5a793 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6740,7 +6740,7 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u16 *engine_i
 	}
 }
 
-static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr)
+static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u64 *event_mask)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
 	u32 val;
@@ -6755,7 +6755,7 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr
 		*addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
 
 		dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr);
-		hl_capture_page_fault(hdev, *addr, 0, true);
+		hl_handle_page_fault(hdev, *addr, 0, true, event_mask);
 
 		WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0);
 	}
@@ -7323,7 +7323,7 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
 	if (razwi) {
 		gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read,
 						&is_write);
-		gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr);
+		gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, event_mask);
 
 		if (is_read)
 			razwi_flags |= HL_RAZWI_READ;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 7747e19e81fe..e50cb71df081 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -722,6 +722,7 @@ enum hl_server_type {
  * HL_NOTIFIER_EVENT_USER_ENGINE_ERR	- Indicates device engine in error state
  * HL_NOTIFIER_EVENT_GENERAL_HW_ERR     - Indicates device HW error
  * HL_NOTIFIER_EVENT_RAZWI              - Indicates razwi happened
+ * HL_NOTIFIER_EVENT_PAGE_FAULT         - Indicates page fault happened
  */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
@@ -731,6 +732,7 @@ enum hl_server_type {
 #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR	(1ULL << 5)
 #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR	(1ULL << 6)
 #define HL_NOTIFIER_EVENT_RAZWI			(1ULL << 7)
+#define HL_NOTIFIER_EVENT_PAGE_FAULT		(1ULL << 8)
 
 /* Opcode for management ioctl
  *
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 05/20] habanalabs/gaudi2: implement fp32 not supported event
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (2 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 04/20] habanalabs/gaudi: add page fault notify event Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 06/20] habanalabs/gaudi2: add razwi notify event Oded Gabbay
                   ` (14 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

Due to binning, Gaudi2 does not always support fp32.
Add support for the event that is reported when the user uses fp32
on a device that does not support it.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c                      | 5 +++++
 drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h | 1 +
 .../include/gaudi2/gaudi2_async_ids_map_extended.h           | 4 +++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index f21b68be6d20..77bdbab41e6c 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -9148,6 +9148,11 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
+	case GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED:
+		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+		is_critical = true;
+		break;
+
 	default:
 		if (gaudi2_irq_map_table[event_type].valid)
 			dev_err_ratelimited(hdev->dev, "Cannot find handler for event %d\n",
diff --git a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h
index 34406770a76a..305b576222e6 100644
--- a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h
+++ b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h
@@ -957,6 +957,7 @@ enum gaudi2_async_event_id {
 	GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG0 = 1317,
 	GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1 = 1318,
 	GAUDI2_EVENT_ARC_DCCM_FULL = 1319,
+	GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED = 1320,
 	GAUDI2_EVENT_SIZE,
 };
 
diff --git a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
index 5bd4383c9f2c..d510cb10c883 100644
--- a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
+++ b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2018-2021 HabanaLabs, Ltd.
+ * Copyright 2018-2022 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -2663,6 +2663,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
 		.msg = 1, .reset = 0, .name = "STATUS_NIC11_ENG1" },
 	{ .fc_id = 1319, .cpu_id = 625, .valid = 1,
 		.msg = 1, .reset = 0, .name = "ARC_DCCM_FULL" },
+	{ .fc_id = 1320, .cpu_id = 626, .valid = 1,
+		.msg = 1, .reset = 1, .name = "FP32_NOT_SUPPORTED" },
 };
 
 #endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 06/20] habanalabs/gaudi2: add razwi notify event
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (3 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 05/20] habanalabs/gaudi2: implement fp32 not supported event Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 07/20] habanalabs: fix firmware descriptor copy operation Oded Gabbay
                   ` (13 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Each time a razwi (read-only zero, write ignored) event happens, besides
capturing its data, also notify the user about it.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c |   4 +-
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 140 +++++++++++++-----------
 2 files changed, 82 insertions(+), 62 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 31818121ef4d..708db0f48ee0 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -2414,7 +2414,9 @@ void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_o
 			u8 flags, u64 *event_mask)
 {
 	hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags);
-	*event_mask |= HL_NOTIFIER_EVENT_RAZWI;
+
+	if (event_mask)
+		*event_mask |= HL_NOTIFIER_EVENT_RAZWI;
 }
 
 static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 77bdbab41e6c..59940c8df2d2 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -7063,7 +7063,7 @@ static void gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *q
 static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 			u64 rtr_mstr_if_base_addr, bool is_write, char *name,
 			bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info,
-			enum gaudi2_engine_id id)
+			enum gaudi2_engine_id id, u64 *event_mask)
 {
 	u32 razwi_hi, razwi_lo, razwi_xy;
 	u16 eng_id = id;
@@ -7093,8 +7093,8 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 		rd_wr_flag = HL_RAZWI_READ;
 	}
 
-	hl_capture_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &eng_id, 1,
-				rd_wr_flag | HL_RAZWI_HBW);
+	hl_handle_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &eng_id, 1,
+				rd_wr_flag | HL_RAZWI_HBW, event_mask);
 
 	dev_err_ratelimited(hdev->dev,
 		"%s-RAZWI SHARED RR HBW %s error, address %#llx, Initiator coordinates 0x%x\n",
@@ -7104,7 +7104,7 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev,
 			u64 rtr_mstr_if_base_addr, bool is_write, char *name,
 			bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info,
-			enum gaudi2_engine_id id)
+			enum gaudi2_engine_id id, u64 *event_mask)
 {
 	u32 razwi_addr, razwi_xy;
 	u16 eng_id = id;
@@ -7132,7 +7132,7 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev,
 		rd_wr_flag = HL_RAZWI_READ;
 	}
 
-	hl_capture_razwi(hdev, razwi_addr, &eng_id, 1, rd_wr_flag | HL_RAZWI_LBW);
+	hl_handle_razwi(hdev, razwi_addr, &eng_id, 1, rd_wr_flag | HL_RAZWI_LBW, event_mask);
 	dev_err_ratelimited(hdev->dev,
 				"%s-RAZWI SHARED RR LBW %s error, mstr_if 0x%llx, captured address 0x%x Initiator coordinates 0x%x\n",
 				name, is_write ? "WR" : "RD", rtr_mstr_if_base_addr, razwi_addr,
@@ -7189,7 +7189,8 @@ static enum gaudi2_engine_id gaudi2_razwi_calc_engine_id(struct hl_device *hdev,
  */
 static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 				enum razwi_event_sources module, u8 module_idx,
-				u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info)
+				u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info,
+				u64 *event_mask)
 {
 	bool via_sft = false, read_razwi_regs = false;
 	u32 rtr_id, dcore_id, dcore_rtr_id, sft_id, eng_id;
@@ -7330,7 +7331,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 	if (hbw_shrd_aw) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true,
 						initiator_name, read_razwi_regs, razwi_info,
-						eng_id);
+						eng_id, event_mask);
 
 		/* Clear event indication */
 		if (read_razwi_regs)
@@ -7340,7 +7341,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 	if (hbw_shrd_ar) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false,
 						initiator_name, read_razwi_regs, razwi_info,
-						eng_id);
+						eng_id, event_mask);
 
 		/* Clear event indication */
 		if (read_razwi_regs)
@@ -7350,7 +7351,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 	if (lbw_shrd_aw) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true,
 						initiator_name, read_razwi_regs, razwi_info,
-						eng_id);
+						eng_id, event_mask);
 
 		/* Clear event indication */
 		if (read_razwi_regs)
@@ -7360,7 +7361,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 	if (lbw_shrd_ar) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false,
 						initiator_name, read_razwi_regs, razwi_info,
-						eng_id);
+						eng_id, event_mask);
 
 		/* Clear event indication */
 		if (read_razwi_regs)
@@ -7376,38 +7377,42 @@ static void gaudi2_check_if_razwi_happened(struct hl_device *hdev)
 	/* check all TPCs */
 	for (mod_idx = 0 ; mod_idx < (NUM_OF_TPC_PER_DCORE * NUM_OF_DCORES + 1) ; mod_idx++) {
 		if (prop->tpc_enabled_mask & BIT(mod_idx))
-			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, mod_idx, 0, NULL);
+			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, mod_idx, 0, NULL,
+								NULL);
 	}
 
 	/* check all MMEs */
 	for (mod_idx = 0 ; mod_idx < (NUM_OF_MME_PER_DCORE * NUM_OF_DCORES) ; mod_idx++)
 		for (sub_mod = MME_WAP0 ; sub_mod < MME_INITIATORS_MAX ; sub_mod++)
 			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mod_idx,
-								sub_mod, NULL);
+									sub_mod, NULL, NULL);
 
 	/* check all EDMAs */
 	for (mod_idx = 0 ; mod_idx < (NUM_OF_EDMA_PER_DCORE * NUM_OF_DCORES) ; mod_idx++)
 		if (prop->edma_enabled_mask & BIT(mod_idx))
-			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_EDMA, mod_idx, 0, NULL);
+			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_EDMA, mod_idx, 0, NULL,
+								NULL);
 
 	/* check all PDMAs */
 	for (mod_idx = 0 ; mod_idx < NUM_OF_PDMA ; mod_idx++)
-		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, mod_idx, 0, NULL);
+		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, mod_idx, 0, NULL,
+							NULL);
 
 	/* check all NICs */
 	for (mod_idx = 0 ; mod_idx < NIC_NUMBER_OF_PORTS ; mod_idx++)
 		if (hdev->nic_ports_mask & BIT(mod_idx))
 			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_NIC, mod_idx >> 1, 0,
-								NULL);
+								NULL, NULL);
 
 	/* check all DECs */
 	for (mod_idx = 0 ; mod_idx < NUMBER_OF_DEC ; mod_idx++)
 		if (prop->decoder_enabled_mask & BIT(mod_idx))
-			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, mod_idx, 0, NULL);
+			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, mod_idx, 0, NULL,
+								NULL);
 
 	/* check all ROTs */
 	for (mod_idx = 0 ; mod_idx < NUM_OF_ROT ; mod_idx++)
-		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, mod_idx, 0, NULL);
+		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, mod_idx, 0, NULL, NULL);
 }
 
 static const char *gaudi2_get_initiators_name(u32 rtr_id)
@@ -7625,7 +7630,8 @@ static u16 gaudi2_get_razwi_initiators(u32 rtr_id, u16 *engines)
 }
 
 static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u32 rtr_id,
-							u64 rtr_ctrl_base_addr, bool is_write)
+							u64 rtr_ctrl_base_addr, bool is_write,
+							u64 *event_mask)
 {
 	u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng;
 	u32 razwi_hi, razwi_lo;
@@ -7649,8 +7655,8 @@ static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u
 		WREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_SET, 0x1);
 	}
 
-	hl_capture_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &engines[0], num_of_eng,
-				rd_wr_flag | HL_RAZWI_HBW);
+	hl_handle_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &engines[0], num_of_eng,
+				rd_wr_flag | HL_RAZWI_HBW, event_mask);
 	dev_err_ratelimited(hdev->dev,
 		"RAZWI PSOC unmapped HBW %s error, rtr id %u, address %#llx\n",
 		is_write ? "WR" : "RD", rtr_id, (u64)razwi_hi << 32 | razwi_lo);
@@ -7660,7 +7666,8 @@ static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u
 }
 
 static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u32 rtr_id,
-							u64 rtr_ctrl_base_addr, bool is_write)
+							u64 rtr_ctrl_base_addr, bool is_write,
+							u64 *event_mask)
 {
 	u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng;
 	u32 razwi_addr;
@@ -7682,7 +7689,8 @@ static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u
 		WREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AR_SET, 0x1);
 	}
 
-	hl_capture_razwi(hdev, razwi_addr, &engines[0], num_of_eng, rd_wr_flag | HL_RAZWI_LBW);
+	hl_handle_razwi(hdev, razwi_addr, &engines[0], num_of_eng, rd_wr_flag | HL_RAZWI_LBW,
+			event_mask);
 	dev_err_ratelimited(hdev->dev,
 		"RAZWI PSOC unmapped LBW %s error, rtr id %u, address %#x\n",
 		is_write ? "WR" : "RD", rtr_id, razwi_addr);
@@ -7692,7 +7700,7 @@ static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u
 }
 
 /* PSOC RAZWI interrupt occurs only when trying to access a bad address */
-static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev)
+static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev, u64 *event_mask)
 {
 	u32 hbw_aw_set, hbw_ar_set, lbw_aw_set, lbw_ar_set, rtr_id, dcore_id, dcore_rtr_id, xy,
 								razwi_mask_info, razwi_intr = 0;
@@ -7746,19 +7754,19 @@ static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev)
 
 	if (hbw_aw_set)
 		gaudi2_razwi_unmapped_addr_hbw_printf_info(hdev, rtr_id,
-						rtr_ctrl_base_addr, true);
+						rtr_ctrl_base_addr, true, event_mask);
 
 	if (hbw_ar_set)
 		gaudi2_razwi_unmapped_addr_hbw_printf_info(hdev, rtr_id,
-						rtr_ctrl_base_addr, false);
+						rtr_ctrl_base_addr, false, event_mask);
 
 	if (lbw_aw_set)
 		gaudi2_razwi_unmapped_addr_lbw_printf_info(hdev, rtr_id,
-						rtr_ctrl_base_addr, true);
+						rtr_ctrl_base_addr, true, event_mask);
 
 	if (lbw_ar_set)
 		gaudi2_razwi_unmapped_addr_lbw_printf_info(hdev, rtr_id,
-						rtr_ctrl_base_addr, false);
+						rtr_ctrl_base_addr, false, event_mask);
 
 clear:
 	/* Clear Interrupts only on pldm or if f/w doesn't handle interrupts */
@@ -7784,7 +7792,7 @@ static void _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base)
 }
 
 static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type,
-					struct hl_eq_razwi_info *razwi_info)
+					struct hl_eq_razwi_info *razwi_info, u64 *event_mask)
 {
 	enum razwi_event_sources module;
 	u64 qman_base;
@@ -7837,7 +7845,7 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type,
 
 	/* check if RAZWI happened */
 	if (razwi_info)
-		gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, razwi_info);
+		gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, razwi_info, event_mask);
 }
 
 static void gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type)
@@ -8003,7 +8011,8 @@ static void gaudi2_handle_cpu_sei_err(struct hl_device *hdev)
 }
 
 static void gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index,
-					struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause)
+					struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause,
+					u64 *event_mask)
 {
 	u64 intr_cause_data = le64_to_cpu(razwi_with_intr_cause->intr_cause.intr_cause_data);
 	int i;
@@ -8015,11 +8024,12 @@ static void gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index,
 
 	/* check if RAZWI happened */
 	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, rot_index, 0,
-						&razwi_with_intr_cause->razwi_info);
+						&razwi_with_intr_cause->razwi_info, event_mask);
 }
 
 static void gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char *interrupt_name,
-					struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause)
+					struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause,
+					u64 *event_mask)
 {
 	u64 intr_cause_data = le64_to_cpu(razwi_with_intr_cause->intr_cause.intr_cause_data);
 	int i;
@@ -8031,11 +8041,11 @@ static void gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char
 
 	/* check if RAZWI happened */
 	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, tpc_index, 0,
-						&razwi_with_intr_cause->razwi_info);
+						&razwi_with_intr_cause->razwi_info, event_mask);
 }
 
 static void gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const char *interrupt_name,
-				struct hl_eq_razwi_info *razwi_info)
+				struct hl_eq_razwi_info *razwi_info, u64 *event_mask)
 {
 	u32 sts_addr, sts_val, sts_clr_val = 0;
 	int i;
@@ -8061,14 +8071,15 @@ static void gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const ch
 	}
 
 	/* check if RAZWI happened */
-	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, dec_index, 0, razwi_info);
+	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, dec_index, 0, razwi_info,
+						event_mask);
 
 	/* Write 1 clear errors */
 	WREG32(sts_addr, sts_clr_val);
 }
 
 static void gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const char *interrupt_name,
-				struct hl_eq_razwi_info *razwi_info)
+				struct hl_eq_razwi_info *razwi_info, u64 *event_mask)
 {
 	u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0;
 	int i;
@@ -8088,7 +8099,8 @@ static void gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const ch
 
 	/* check if RAZWI happened */
 	for (i = MME_WRITE ; i < MME_INITIATORS_MAX ; i++)
-		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, i, razwi_info);
+		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, i, razwi_info,
+							event_mask);
 
 	WREG32(sts_clr_addr, sts_clr_val);
 }
@@ -8105,7 +8117,7 @@ static void gaudi2_handle_mme_sbte_err(struct hl_device *hdev, u8 mme_index, u8
 }
 
 static void gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index,
-					struct hl_eq_razwi_info *razwi_info)
+					struct hl_eq_razwi_info *razwi_info, u64 *event_mask)
 {
 	u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0;
 	int i;
@@ -8125,8 +8137,10 @@ static void gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index,
 	}
 
 	/* check if RAZWI happened on WAP0/1 */
-	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP0, razwi_info);
-	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP1, razwi_info);
+	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP0, razwi_info,
+						event_mask);
+	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP1, razwi_info,
+						event_mask);
 
 	WREG32(sts_clr_addr, sts_clr_val);
 }
@@ -8156,40 +8170,41 @@ static void gaudi2_handle_dma_core_event(struct hl_device *hdev, u64 intr_cause_
 						gaudi2_dma_core_interrupts_cause[i]);
 }
 
-static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev)
+static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev, u64 *event_mask)
 {
 	u32 mstr_if_base_addr = mmPCIE_MSTR_RR_MSTR_IF_RR_SHRD_HBW_BASE, razwi_happened_addr;
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true,
-							NULL, GAUDI2_ENGINE_ID_PCIE);
+							NULL, GAUDI2_ENGINE_ID_PCIE, event_mask);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true,
-							NULL, GAUDI2_ENGINE_ID_PCIE);
+							NULL, GAUDI2_ENGINE_ID_PCIE, event_mask);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true,
-							NULL, GAUDI2_ENGINE_ID_PCIE);
+							NULL, GAUDI2_ENGINE_ID_PCIE, event_mask);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true,
-							NULL, GAUDI2_ENGINE_ID_PCIE);
+							NULL, GAUDI2_ENGINE_ID_PCIE, event_mask);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 }
 
-static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cause_data)
+static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cause_data,
+						u64 *event_mask)
 {
 	int i;
 
@@ -8204,7 +8219,7 @@ static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cau
 		case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK:
 			break;
 		case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK:
-			gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev);
+			gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask);
 			break;
 		}
 	}
@@ -8818,29 +8833,30 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
 	case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
-		gaudi2_handle_qm_sei_err(hdev, event_type, &eq_entry->razwi_info);
+		gaudi2_handle_qm_sei_err(hdev, event_type, &eq_entry->razwi_info, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
 	case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE:
 	case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE:
 		index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE;
-		gaudi2_handle_rot_err(hdev, index, &eq_entry->razwi_with_intr_cause);
-		gaudi2_handle_qm_sei_err(hdev, event_type, NULL);
+		gaudi2_handle_rot_err(hdev, index, &eq_entry->razwi_with_intr_cause, &event_mask);
+		gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
 	case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP:
 		index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP;
 		gaudi2_tpc_ack_interrupts(hdev, index, "AXI_ERR_RSP",
-						&eq_entry->razwi_with_intr_cause);
-		gaudi2_handle_qm_sei_err(hdev, event_type, NULL);
+						&eq_entry->razwi_with_intr_cause, &event_mask);
+		gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
 	case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE:
 		index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE;
-		gaudi2_handle_dec_err(hdev, index, "AXI_ERR_RESPONSE", &eq_entry->razwi_info);
+		gaudi2_handle_dec_err(hdev, index, "AXI_ERR_RESPONSE", &eq_entry->razwi_info,
+					&event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8871,7 +8887,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_TPC24_KERNEL_ERR:
 		index = (event_type - GAUDI2_EVENT_TPC0_KERNEL_ERR) /
 			(GAUDI2_EVENT_TPC1_KERNEL_ERR - GAUDI2_EVENT_TPC0_KERNEL_ERR);
-		gaudi2_tpc_ack_interrupts(hdev, index, "KRN_ERR", &eq_entry->razwi_with_intr_cause);
+		gaudi2_tpc_ack_interrupts(hdev, index, "KRN_ERR", &eq_entry->razwi_with_intr_cause,
+						&event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8887,7 +8904,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_DEC9_SPI:
 		index = (event_type - GAUDI2_EVENT_DEC0_SPI) /
 				(GAUDI2_EVENT_DEC1_SPI - GAUDI2_EVENT_DEC0_SPI);
-		gaudi2_handle_dec_err(hdev, index, "SPI", &eq_entry->razwi_info);
+		gaudi2_handle_dec_err(hdev, index, "SPI", &eq_entry->razwi_info, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8899,8 +8916,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 				(GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE -
 						GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE);
 		gaudi2_handle_mme_err(hdev, index,
-				"CTRL_AXI_ERROR_RESPONSE", &eq_entry->razwi_info);
-		gaudi2_handle_qm_sei_err(hdev, event_type, NULL);
+				"CTRL_AXI_ERROR_RESPONSE", &eq_entry->razwi_info, &event_mask);
+		gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8911,7 +8928,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		index = (event_type - GAUDI2_EVENT_MME0_QMAN_SW_ERROR) /
 				(GAUDI2_EVENT_MME1_QMAN_SW_ERROR -
 					GAUDI2_EVENT_MME0_QMAN_SW_ERROR);
-		gaudi2_handle_mme_err(hdev, index, "QMAN_SW_ERROR", &eq_entry->razwi_info);
+		gaudi2_handle_mme_err(hdev, index, "QMAN_SW_ERROR", &eq_entry->razwi_info,
+					&event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8922,7 +8940,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		index = (event_type - GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID) /
 				(GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID -
 					GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID);
-		gaudi2_handle_mme_wap_err(hdev, index, &eq_entry->razwi_info);
+		gaudi2_handle_mme_wap_err(hdev, index, &eq_entry->razwi_info, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8941,7 +8959,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 
 	case GAUDI2_EVENT_PCIE_ADDR_DEC_ERR:
 		gaudi2_print_pcie_addr_dec_info(hdev,
-				le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+				le64_to_cpu(eq_entry->intr_cause.intr_cause_data), &event_mask);
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
@@ -8970,7 +8988,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		break;
 
 	case GAUDI2_EVENT_PSOC63_RAZWI_OR_PID_MIN_MAX_INTERRUPT:
-		gaudi2_ack_psoc_razwi_event_handler(hdev);
+		gaudi2_ack_psoc_razwi_event_handler(hdev, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 07/20] habanalabs: fix firmware descriptor copy operation
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (4 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 06/20] habanalabs/gaudi2: add razwi notify event Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 08/20] habanalabs: skip events info ioctl if not supported Oded Gabbay
                   ` (12 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: farah kassabri

From: farah kassabri <fkassabri@habana.ai>

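Copy the firmware descriptor in two phases, sizing the second copy by the
data size that the firmware reports in the descriptor header. This is
needed to allow adding more data to the lkd_fw_comms_desc structure.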

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 26 ++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index f18e53bbba6b..01c4ffba6e97 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -12,6 +12,7 @@
 #include <linux/crc32.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
+#include <linux/vmalloc.h>
 
 #define FW_FILE_MAX_SIZE		0x1400000 /* maximum size of 20MB */
 
@@ -1988,10 +1989,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
 						struct fw_load_mgr *fw_loader)
 {
 	struct lkd_fw_comms_desc *fw_desc;
+	void __iomem *src, *temp_fw_desc;
 	struct pci_mem_region *region;
 	struct fw_response *response;
+	u16 fw_data_size;
 	enum pci_region region_id;
-	void __iomem *src;
 	int rc;
 
 	fw_desc = &fw_loader->dynamic_loader.comm_desc;
@@ -2018,9 +2020,29 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
 	fw_loader->dynamic_loader.fw_desc_valid = false;
 	src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
 							response->ram_offset;
+
+	/*
+	 * We do the copy of the fw descriptor in 2 phases:
+	 * 1. copy the header + data info according to our lkd_fw_comms_desc definition.
+	 *    then we're able to read the actual data size provided by fw.
+	 *    this is needed for cases where data in descriptor was changed(add/remove)
+	 *    in embedded specs header file before updating lkd copy of the header file
+	 * 2. copy descriptor to temporary buffer with aligned size and send it to validation
+	 */
 	memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc));
+	fw_data_size = le16_to_cpu(fw_desc->header.size);
+
+	temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size);
+	if (!temp_fw_desc)
+		return -ENOMEM;
+
+	memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size);
 
-	return hl_fw_dynamic_validate_descriptor(hdev, fw_loader, fw_desc);
+	rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader,
+					(struct lkd_fw_comms_desc *) temp_fw_desc);
+	vfree(temp_fw_desc);
+
+	return rc;
 }
 
 /**
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 08/20] habanalabs: skip events info ioctl if not supported
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (5 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 07/20] habanalabs: fix firmware descriptor copy operation Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 09/20] habanalabs/gaudi2: classify power/thermal events as info Oded Gabbay
                   ` (11 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

Some ASICs haven't yet implemented this functionality and so the
ioctl call should fail and the user should be notified of the reason.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs_ioctl.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index ee43017eb563..b6abfa7761a7 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -123,6 +123,10 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate,
 		return -EINVAL;
 
 	arr = hdev->asic_funcs->get_events_stat(hdev, aggregate, &size);
+	if (!arr) {
+		dev_err(hdev->dev, "Events info not supported\n");
+		return -EOPNOTSUPP;
+	}
 
 	return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 09/20] habanalabs/gaudi2: classify power/thermal events as info
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (6 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 08/20] habanalabs: skip events info ioctl if not supported Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 10/20] habanalabs/gaudi2: add page fault notify event Oded Gabbay
                   ` (10 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

As power and thermal envelope events are purely informative and do not
indicate an error, we reduce the print level to info only.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 59940c8df2d2..61960fa059e0 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -6828,6 +6828,7 @@ static inline bool is_info_event(u32 event)
 {
 	switch (event) {
 	case GAUDI2_EVENT_CPU_CPLD_SHUTDOWN_CAUSE:
+	case GAUDI2_EVENT_CPU_FIX_POWER_ENV_S ... GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E:
 		return true;
 	default:
 		return false;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 10/20] habanalabs/gaudi2: add page fault notify event
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (7 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 09/20] habanalabs/gaudi2: classify power/thermal events as info Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 11/20] habanalabs: fix print for out-of-sync and pkt-failure events Oded Gabbay
                   ` (9 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Each time a page fault happens, besides capturing its data, also notify
the user about it.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 61960fa059e0..65c9b535aa69 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8253,7 +8253,8 @@ static void gaudi2_handle_hif_fatal(struct hl_device *hdev, u16 event_type, u64
 	}
 }
 
-static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu)
+static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu,
+					u64 *event_mask)
 {
 	u32 valid, val;
 	u64 addr;
@@ -8270,7 +8271,7 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool
 
 	dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n",
 				is_pmmu ? "PMMU" : "HMMU", addr);
-	hl_capture_page_fault(hdev, addr, 0, is_pmmu);
+	hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask);
 
 	WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE), 0);
 }
@@ -8296,7 +8297,7 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo
 }
 
 static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char *mmu_name,
-						u64 mmu_base, bool is_pmmu)
+						u64 mmu_base, bool is_pmmu, u64 *event_mask)
 {
 	u32 spi_sei_cause, interrupt_clr = 0x0;
 	int i;
@@ -8309,7 +8310,7 @@ static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char
 						mmu_name, gaudi2_mmu_spi_sei[i].cause);
 
 			if (i == 0)
-				gaudi2_handle_page_error(hdev, mmu_base, is_pmmu);
+				gaudi2_handle_page_error(hdev, mmu_base, is_pmmu, event_mask);
 			else if (i == 1)
 				gaudi2_handle_access_error(hdev, mmu_base, is_pmmu);
 
@@ -8381,7 +8382,7 @@ static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index)
 	return reset;
 }
 
-static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type)
+static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask)
 {
 	bool is_pmmu = false;
 	char desc[32];
@@ -8439,7 +8440,7 @@ static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type
 		return;
 	}
 
-	gaudi2_handle_mmu_spi_sei_generic(hdev, desc, mmu_base, is_pmmu);
+	gaudi2_handle_mmu_spi_sei_generic(hdev, desc, mmu_base, is_pmmu, event_mask);
 }
 
 
@@ -8969,7 +8970,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
 	case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR:
 	case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0:
-		gaudi2_handle_mmu_spi_sei_err(hdev, event_type);
+		gaudi2_handle_mmu_spi_sei_err(hdev, event_type, &event_mask);
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
@@ -10206,7 +10207,7 @@ static void gaudi2_ack_mmu_error(struct hl_device *hdev, u64 mmu_id)
 	if (gaudi2_get_mmu_base(hdev, mmu_id, &mmu_base))
 		return;
 
-	gaudi2_handle_page_error(hdev, mmu_base, is_pmmu);
+	gaudi2_handle_page_error(hdev, mmu_base, is_pmmu, NULL);
 	gaudi2_handle_access_error(hdev, mmu_base, is_pmmu);
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 11/20] habanalabs: fix print for out-of-sync and pkt-failure events
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (8 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 10/20] habanalabs/gaudi2: add page fault notify event Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 12/20] habanalabs/gaudi: fix print for firmware-alive event Oded Gabbay
                   ` (8 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Add missing le32_to_cpu() conversions, and use %d for the value
returned from atomic_read().

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c   | 4 ++--
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 8 ++++----
 drivers/misc/habanalabs/goya/goya.c     | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index cbe1daf5a793..7b93f0d26dd0 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7347,8 +7347,8 @@ static void gaudi_print_out_of_sync_info(struct hl_device *hdev,
 {
 	struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ];
 
-	dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
-			sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+	dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
+		le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
 }
 
 static void gaudi_print_fw_alive_info(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 65c9b535aa69..bdb5782afb7e 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8684,8 +8684,8 @@ static void gaudi2_print_out_of_sync_info(struct hl_device *hdev,
 {
 	struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ];
 
-	dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
-			sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+	dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
+		le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
 }
 
 static void gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev)
@@ -8751,8 +8751,8 @@ static void gaudi2_print_cpu_pkt_failure_info(struct hl_device *hdev,
 	struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ];
 
 	dev_warn(hdev->dev,
-		"FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
-		sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+		"FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
+		le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
 }
 
 static void hl_arc_event_handle(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 5ef9e3ca97a6..0f083fcf81a6 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4475,8 +4475,8 @@ static void goya_print_out_of_sync_info(struct hl_device *hdev,
 {
 	struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
 
-	dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
-			sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+	dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
+		le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
 }
 
 static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 12/20] habanalabs/gaudi: fix print for firmware-alive event
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (9 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 11/20] habanalabs: fix print for out-of-sync and pkt-failure events Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 13/20] habanalabs/gaudi2: remove redundant firmware version check Oded Gabbay
                   ` (7 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Add missing le{32,64}_to_cpu conversions.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 7b93f0d26dd0..9f5e208701ba 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7356,9 +7356,10 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev,
 {
 	dev_err(hdev->dev,
 		"FW alive report: severity=%s, process_id=%u, thread_id=%u, uptime=%llu seconds\n",
-		(fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ?
-		"Minor" : "Critical", fw_alive->process_id,
-		fw_alive->thread_id, fw_alive->uptime_seconds);
+		(fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ? "Minor" : "Critical",
+		le32_to_cpu(fw_alive->process_id),
+		le32_to_cpu(fw_alive->thread_id),
+		le64_to_cpu(fw_alive->uptime_seconds));
 }
 
 static void gaudi_print_nic_axi_irq_info(struct hl_device *hdev, u16 event_type,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 13/20] habanalabs/gaudi2: remove redundant firmware version check
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (10 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 12/20] habanalabs/gaudi: fix print for firmware-alive event Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 14/20] habanalabs/gaudi2: don't enable entries in the MSIX_GW table Oded Gabbay
                   ` (6 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: farah kassabri

From: farah kassabri <fkassabri@habana.ai>

Firmware 1.7 is the first official firmware, so no need to check
if we are running a version below it.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index bdb5782afb7e..36f0ea1100bb 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -10358,10 +10358,9 @@ int gaudi2_send_device_activity(struct hl_device *hdev, bool open)
 {
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
 
-	if (!(gaudi2->hw_cap_initialized & HW_CAP_CPU_Q) || hdev->fw_major_version < 37)
+	if (!(gaudi2->hw_cap_initialized & HW_CAP_CPU_Q))
 		return 0;
 
-	/* TODO: add check for FW version using minor ver once it's known */
 	return hl_fw_send_device_activity(hdev, open);
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 14/20] habanalabs/gaudi2: don't enable entries in the MSIX_GW table
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (11 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 13/20] habanalabs/gaudi2: remove redundant firmware version check Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 15/20] habanalabs/gaudi2: return to reset upon SM SEI BRESP error Oded Gabbay
                   ` (5 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

User should use the virtual MSI-X doorbell to generate interrupts from
the device, so there is no need to enable entries in the MSIX_GW table.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 26 -------------------------
 1 file changed, 26 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 36f0ea1100bb..d5efec347bc1 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -4695,30 +4695,6 @@ static void gaudi2_init_dec(struct hl_device *hdev)
 	}
 }
 
-static void gaudi2_init_msix_gw_table(struct hl_device *hdev)
-{
-	u32 first_reg_offset, last_reg_offset, msix_gw_table_base;
-	u8 first_bit, last_bit;
-	int i;
-
-	msix_gw_table_base = mmPCIE_WRAP_MSIX_GW_TABLE_0;
-	first_reg_offset = (GAUDI2_IRQ_NUM_USER_FIRST >> 5) << 2;
-	first_bit = GAUDI2_IRQ_NUM_USER_FIRST % 32;
-	last_reg_offset = (GAUDI2_IRQ_NUM_USER_LAST >> 5) << 2;
-	last_bit = GAUDI2_IRQ_NUM_USER_LAST % 32;
-
-	if (first_reg_offset == last_reg_offset) {
-		WREG32(msix_gw_table_base + first_reg_offset, GENMASK(last_bit, first_bit));
-		return;
-	}
-
-	WREG32(msix_gw_table_base + first_reg_offset, GENMASK(31, first_bit));
-	WREG32(msix_gw_table_base + last_reg_offset, GENMASK(last_bit, 0));
-
-	for (i = first_reg_offset + 4; i < last_reg_offset ; i += 4)
-		WREG32(msix_gw_table_base + i, 0xFFFFFFFF);
-}
-
 static int gaudi2_mmu_update_asid_hop0_addr(struct hl_device *hdev,
 					u32 stlb_base, u32 asid, u64 phys_addr)
 {
@@ -5232,8 +5208,6 @@ static int gaudi2_hw_init(struct hl_device *hdev)
 		return rc;
 	}
 
-	gaudi2_init_msix_gw_table(hdev);
-
 	gaudi2_init_scrambler_hbm(hdev);
 	gaudi2_init_kdma(hdev);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 15/20] habanalabs/gaudi2: return to reset upon SM SEI BRESP error
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (12 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 14/20] habanalabs/gaudi2: don't enable entries in the MSIX_GW table Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 16/20] habanalabs: reset device if still in use when released Oded Gabbay
                   ` (4 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Due to a H/W issue in the LBW path to the PCIE_DBI MSI-X doorbell, there
were sporadic false error responses in SM when it was configured to
write there, and hence no reset was done as part of handling the
relevant event.
Now that the virtual MSI-X doorbell is used, such errors in SM are not
expected and reset shouldn't be skipped.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index d5efec347bc1..f0f2f77f56de 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8300,11 +8300,10 @@ static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char
 	WREG32(mmu_base + MMU_INTERRUPT_CLR_OFFSET, interrupt_clr);
 }
 
-static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index)
+static void gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index)
 {
 	u32 sei_cause_addr, sei_cause_val, sei_cause_cause, sei_cause_log;
 	u32 cq_intr_addr, cq_intr_val, cq_intr_queue_index;
-	bool reset = true;
 	int i;
 
 	sei_cause_addr = mmDCORE0_SYNC_MNGR_GLBL_SM_SEI_CAUSE + DCORE_OFFSET * sm_index;
@@ -8329,10 +8328,6 @@ static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index)
 					gaudi2_sm_sei_cause[i].cause_name,
 					gaudi2_sm_sei_cause[i].log_name,
 					sei_cause_log & gaudi2_sm_sei_cause[i].log_mask);
-
-			/* Due to a potential H/W issue, do not reset upon BRESP errors */
-			if (i == 2)
-				reset = false;
 			break;
 		}
 
@@ -8352,8 +8347,6 @@ static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index)
 		/* Clear CQ_INTR */
 		WREG32(cq_intr_addr, 0);
 	}
-
-	return reset;
 }
 
 static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask)
@@ -8755,8 +8748,8 @@ static void hl_arc_event_handle(struct hl_device *hdev,
 
 static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 {
-	bool reset_required = false, skip_reset = false, is_critical = false;
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
+	bool reset_required = false, is_critical = false;
 	u32 ctl, reset_flags = HL_DRV_RESET_HARD;
 	int index, sbte_index;
 	u64 event_mask = 0;
@@ -9113,7 +9106,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 
 	case GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_SM3_AXI_ERROR_RESPONSE:
 		index = event_type - GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE;
-		skip_reset = !gaudi2_handle_sm_err(hdev, index);
+		gaudi2_handle_sm_err(hdev, index);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -9153,9 +9146,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 						event_type);
 	}
 
-	if ((gaudi2_irq_map_table[event_type].reset || reset_required) && !skip_reset &&
-			(hdev->hard_reset_on_fw_events ||
-			(hdev->asic_prop.fw_security_enabled && is_critical)))
+	if ((gaudi2_irq_map_table[event_type].reset || reset_required) &&
+				(hdev->hard_reset_on_fw_events ||
+				(hdev->asic_prop.fw_security_enabled && is_critical)))
 		goto reset_device;
 
 	/* Send unmask irq only for interrupts not classified as MSG */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 16/20] habanalabs: reset device if still in use when released
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (13 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 15/20] habanalabs/gaudi2: return to reset upon SM SEI BRESP error Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 17/20] habanalabs: check schedule_hard_reset correctly Oded Gabbay
                   ` (3 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

If the device file is released while a context is still held, it won't
be possible to reopen it until the context is eventually released.
If that doesn't happen, only a device reset will revert it back to an
operational state, i.e. one needs to wait for a CS timeout or an error,
or for an external intervention that injects a reset via sysfs.

At this stage, after the device was released by user, context is held
either because of CS which were left running on the device and are not
relevant anymore, or due to missing cleanup steps from user side.

All of this is in any case handled in the device reset flow, so initiate
the reset at this point instead of waiting for it.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 708db0f48ee0..49640c8ca910 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -504,9 +504,10 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 
 	hdev->compute_ctx_in_release = 1;
 
-	if (!hl_hpriv_put(hpriv))
-		dev_notice(hdev->dev,
-			"User process closed FD but device still in use\n");
+	if (!hl_hpriv_put(hpriv)) {
+		dev_notice(hdev->dev, "User process closed FD but device still in use\n");
+		hl_device_reset(hdev, HL_DRV_RESET_HARD);
+	}
 
 	hdev->last_open_session_duration_jif =
 		jiffies - hdev->last_successful_open_jif;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 17/20] habanalabs: check schedule_hard_reset correctly
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (14 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 16/20] habanalabs: reset device if still in use when released Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 18/20] habanalabs: extend process wait timeout in device fini Oded Gabbay
                   ` (2 subsequent siblings)
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

schedule_hard_reset can be true only if we didn't do a hard reset.
Therefore, there is no point in checking it when hard_reset is true.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
---
 drivers/misc/habanalabs/common/device.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 49640c8ca910..0650e511a0f5 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1737,18 +1737,19 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		 * the device will be operational although it shouldn't be
 		 */
 		hdev->asic_funcs->enable_events_from_fw(hdev);
-	} else if (!reset_upon_device_release) {
-		hdev->reset_info.compute_reset_cnt++;
-	}
-
-	if (schedule_hard_reset) {
-		dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
-		flags = hdev->reset_info.hard_reset_schedule_flags;
-		hdev->reset_info.hard_reset_schedule_flags = 0;
-		hdev->disabled = true;
-		hard_reset = true;
-		handle_reset_trigger(hdev, flags);
-		goto again;
+	} else {
+		if (!reset_upon_device_release)
+			hdev->reset_info.compute_reset_cnt++;
+
+		if (schedule_hard_reset) {
+			dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
+			flags = hdev->reset_info.hard_reset_schedule_flags;
+			hdev->reset_info.hard_reset_schedule_flags = 0;
+			hdev->disabled = true;
+			hard_reset = true;
+			handle_reset_trigger(hdev, flags);
+			goto again;
+		}
 	}
 
 	return 0;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 18/20] habanalabs: extend process wait timeout in device fini
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (15 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 17/20] habanalabs: check schedule_hard_reset correctly Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 19/20] habanalabs/gaudi2: change memory scrub mechanism Oded Gabbay
  2022-11-17 16:19 ` [PATCH 20/20] habanalabs: increase the size of busy engines mask Oded Gabbay
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

Processes that use our device are likely to use other devices, such as
remote storage, at the same time.

In case our device is removed and a user process is still using the
device, we need to kill the user process. However, if that process
has a thread waiting for i/o to complete on remote storage, for example,
the process won't terminate.

Let's give it enough time to terminate before giving up.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
---
 drivers/misc/habanalabs/common/device.c     |  6 ++++--
 drivers/misc/habanalabs/common/habanalabs.h | 11 ++++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 0650e511a0f5..63d0cb7087e8 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -2300,14 +2300,16 @@ void hl_device_fini(struct hl_device *hdev)
 	 */
 	dev_info(hdev->dev,
 		"Waiting for all processes to exit (timeout of %u seconds)",
-		HL_PENDING_RESET_LONG_SEC);
+		HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI);
 
-	rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false);
+	hdev->process_kill_trial_cnt = 0;
+	rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, false);
 	if (rc) {
 		dev_crit(hdev->dev, "Failed to kill all open processes\n");
 		device_disable_open_processes(hdev, false);
 	}
 
+	hdev->process_kill_trial_cnt = 0;
 	rc = device_kill_open_processes(hdev, 0, true);
 	if (rc) {
 		dev_crit(hdev->dev, "Failed to kill all control device open processes\n");
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 0781b8698f74..e7f89868428d 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -50,9 +50,14 @@ struct hl_fpriv;
 #define HL_MMAP_OFFSET_VALUE_MASK	(0x1FFFFFFFFFFFull >> PAGE_SHIFT)
 #define HL_MMAP_OFFSET_VALUE_GET(off)	(off & HL_MMAP_OFFSET_VALUE_MASK)
 
-#define HL_PENDING_RESET_PER_SEC	10
-#define HL_PENDING_RESET_MAX_TRIALS	60 /* 10 minutes */
-#define HL_PENDING_RESET_LONG_SEC	60
+#define HL_PENDING_RESET_PER_SEC		10
+#define HL_PENDING_RESET_MAX_TRIALS		60 /* 10 minutes */
+#define HL_PENDING_RESET_LONG_SEC		60
+/*
+ * In device fini, wait 10 minutes for user processes to be terminated after we kill them.
+ * This is needed to prevent situation of clearing resources while user processes are still alive.
+ */
+#define HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI	600
 
 #define HL_HARD_RESET_MAX_TIMEOUT	120
 #define HL_PLDM_HARD_RESET_MAX_TIMEOUT	(HL_HARD_RESET_MAX_TIMEOUT * 3)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 19/20] habanalabs/gaudi2: change memory scrub mechanism
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (16 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 18/20] habanalabs: extend process wait timeout in device fini Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  2022-11-17 16:19 ` [PATCH 20/20] habanalabs: increase the size of busy engines mask Oded Gabbay
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: farah kassabri

From: farah kassabri <fkassabri@habana.ai>

Currently, the scrubbing mechanism uses the EDMA engines by directly
setting the engine core registers to scrub a chunk of memory.
Due to a sporadic failure with this mechanism, it was decided to
initiate the engines via their QMANs using LIN-DMA packets.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 129 +++++++++++++++---------
 1 file changed, 83 insertions(+), 46 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index f0f2f77f56de..c14e63164a84 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -9171,34 +9171,74 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	hl_device_cond_reset(hdev, reset_flags, event_mask);
 }
 
+static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev,
+			struct packet_lin_dma *lin_dma_pkt, dma_addr_t pkt_dma_addr,
+			u32 hw_queue_id, u32 size, u64 addr, u32 val)
+{
+	u32 ctl, pkt_size;
+	int rc = 0;
+
+	ctl = FIELD_PREP(GAUDI2_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA);
+	ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_MEMSET_MASK, 1);
+	ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_WRCOMP_MASK, 1);
+	ctl |= FIELD_PREP(GAUDI2_PKT_CTL_EB_MASK, 1);
+
+	lin_dma_pkt->ctl = cpu_to_le32(ctl);
+	lin_dma_pkt->src_addr = cpu_to_le64(val);
+	lin_dma_pkt->dst_addr = cpu_to_le64(addr);
+	lin_dma_pkt->tsize = cpu_to_le32(size);
+
+	pkt_size = sizeof(struct packet_lin_dma);
+
+	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr);
+	if (rc)
+		dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n",
+				hw_queue_id);
+
+	return rc;
+}
+
 static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, u64 val)
 {
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u32 edma_queues_id[] = {GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0,
+					GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0,
+					GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0,
+					GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0};
+	u32 chunk_size, dcore, edma_idx, sob_offset, sob_addr, comp_val,
+		old_mmubp, mmubp, num_of_pkts, busy, pkt_size;
 	u64 comp_addr, cur_addr = addr, end_addr = addr + size;
-	u32 chunk_size, busy, dcore, edma_idx, sob_offset, sob_addr, comp_val, edma_commit;
-	u32 old_mmubp, mmubp;
-	int rc = 0;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	void *lin_dma_pkts_arr;
+	dma_addr_t pkt_dma_addr;
+	int rc = 0, dma_num = 0;
+
+	if (prop->edma_enabled_mask == 0) {
+		dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n");
+		return -EIO;
+	}
 
 	sob_offset = hdev->asic_prop.first_available_user_sob[0] * 4;
 	sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset;
 	comp_addr = CFG_BASE + sob_addr;
 	comp_val = FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1) |
 		FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1);
-
-	edma_commit = FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_LIN_MASK, 1) |
-			FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_MEM_SET_MASK, 1) |
-			FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_WR_COMP_EN_MASK, 1);
 	mmubp = FIELD_PREP(ARC_FARM_KDMA_CTX_AXUSER_HB_MMU_BP_WR_MASK, 1) |
 		FIELD_PREP(ARC_FARM_KDMA_CTX_AXUSER_HB_MMU_BP_RD_MASK, 1);
 
-	if (prop->edma_enabled_mask == 0) {
-		dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n");
-		return -EIO;
-	}
+	/* Calculate how many lin dma pkts we'll need */
+	num_of_pkts = div64_u64(round_up(size, SZ_2G), SZ_2G);
+	pkt_size = sizeof(struct packet_lin_dma);
+
+	lin_dma_pkts_arr = hl_asic_dma_alloc_coherent(hdev, pkt_size * num_of_pkts,
+					&pkt_dma_addr, GFP_KERNEL);
+	if (!lin_dma_pkts_arr)
+		return -ENOMEM;
 
 	/*
 	 * set mmu bypass for the scrubbing - all ddmas are configured the same so save
 	 * only the first one to restore later
+	 * also set the sob addr for all edma cores for completion.
+	 * set QM as trusted to allow it to access physical address with MMU bp.
 	 */
 	old_mmubp = RREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP);
 	for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) {
@@ -9211,17 +9251,22 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz
 
 			WREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP +
 					edma_offset, mmubp);
+			WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset,
+					lower_32_bits(comp_addr));
+			WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset,
+					upper_32_bits(comp_addr));
+			WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset,
+					comp_val);
+			gaudi2_qman_set_test_mode(hdev,
+					edma_queues_id[dcore] + 4 * edma_idx, true);
 		}
 	}
 
-	while (cur_addr < end_addr) {
-		int dma_num = 0;
+	WREG32(sob_addr, 0);
 
-		WREG32(sob_addr, 0);
+	while (cur_addr < end_addr) {
 		for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) {
 			for (edma_idx = 0 ; edma_idx < NUM_OF_EDMA_PER_DCORE ; edma_idx++) {
-				u32 edma_offset = dcore * DCORE_OFFSET +
-					edma_idx * DCORE_EDMA_OFFSET;
 				u32 edma_bit = dcore * NUM_OF_EDMA_PER_DCORE + edma_idx;
 
 				if (!(prop->edma_enabled_mask & BIT(edma_bit)))
@@ -9229,41 +9274,26 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz
 
 				chunk_size = min_t(u64, SZ_2G, end_addr - cur_addr);
 
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_SRC_BASE_LO + edma_offset,
-						lower_32_bits(val));
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_SRC_BASE_HI + edma_offset,
-						upper_32_bits(val));
-
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_BASE_LO + edma_offset,
-						lower_32_bits(cur_addr));
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_BASE_HI + edma_offset,
-						upper_32_bits(cur_addr));
-
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset,
-						lower_32_bits(comp_addr));
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset,
-						upper_32_bits(comp_addr));
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset,
-						comp_val);
-
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_TSIZE_0 + edma_offset,
-						chunk_size);
-				WREG32(mmDCORE0_EDMA0_CORE_CTX_COMMIT + edma_offset, edma_commit);
+				rc = gaudi2_memset_memory_chunk_using_edma_qm(hdev,
+					(struct packet_lin_dma *)lin_dma_pkts_arr + dma_num,
+					pkt_dma_addr + dma_num * pkt_size,
+					edma_queues_id[dcore] + edma_idx * 4,
+					chunk_size, cur_addr, val);
+				if (rc)
+					goto end;
 
 				dma_num++;
-
 				cur_addr += chunk_size;
-
 				if (cur_addr == end_addr)
-					goto poll;
+					break;
 			}
 		}
-poll:
-		rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000);
-		if (rc) {
-			dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n");
-			goto end;
-		}
+	}
+
+	rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000);
+	if (rc) {
+		dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n");
+		goto end;
 	}
 end:
 	for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) {
@@ -9275,10 +9305,17 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz
 				continue;
 
 			WREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP + edma_offset, old_mmubp);
+			WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset, 0);
+			WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset, 0);
+			WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset, 0);
+			gaudi2_qman_set_test_mode(hdev,
+					edma_queues_id[dcore] + 4 * edma_idx, false);
 		}
 	}
 
 	WREG32(sob_addr, 0);
+	hl_asic_dma_free_coherent(hdev, pkt_size * num_of_pkts, lin_dma_pkts_arr, pkt_dma_addr);
+
 	return rc;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 20/20] habanalabs: increase the size of busy engines mask
  2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
                   ` (17 preceding siblings ...)
  2022-11-17 16:19 ` [PATCH 19/20] habanalabs/gaudi2: change memory scrub mechanism Oded Gabbay
@ 2022-11-17 16:19 ` Oded Gabbay
  18 siblings, 0 replies; 20+ messages in thread
From: Oded Gabbay @ 2022-11-17 16:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Increase the size of the busy engines mask in 'struct hl_info_hw_idle',
for future ASICs with more than 128 engines.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 9 +++++----
 include/uapi/misc/habanalabs.h          | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 63d0cb7087e8..f5864893237c 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -416,8 +416,9 @@ static void hpriv_release(struct kref *ref)
 		device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
 							HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL);
 	if (!device_is_idle) {
-		dev_err(hdev->dev, "device not idle after user context is closed (0x%llx_%llx)\n",
-			idle_mask[1], idle_mask[0]);
+		dev_err(hdev->dev,
+			"device not idle after user context is closed (0x%llx_%llx_%llx_%llx)\n",
+			idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
 		reset_device = true;
 	}
 
@@ -1661,8 +1662,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	/* If device is not idle fail the reset process */
 	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
 			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
-		dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
-			idle_mask[1], idle_mask[0]);
+		dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx_%llx_%llx) after reset\n",
+			idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
 		rc = -EIO;
 		goto out_err;
 	}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index e50cb71df081..3b995e841eb8 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -916,7 +916,7 @@ struct hl_info_dram_usage {
 	__u64 ctx_dram_mem;
 };
 
-#define HL_BUSY_ENGINES_MASK_EXT_SIZE	2
+#define HL_BUSY_ENGINES_MASK_EXT_SIZE	4
 
 struct hl_info_hw_idle {
 	__u32 is_idle;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-11-17 16:22 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-17 16:19 [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support Oded Gabbay
2022-11-17 16:19 ` [PATCH 02/20] habanalabs/gaudi: add razwi notify event Oded Gabbay
2022-11-17 16:19 ` [PATCH 03/20] habanalabs: use single threaded WQ for event handling Oded Gabbay
2022-11-17 16:19 ` [PATCH 04/20] habanalabs/gaudi: add page fault notify event Oded Gabbay
2022-11-17 16:19 ` [PATCH 05/20] habanalabs/gaudi2: implement fp32 not supported event Oded Gabbay
2022-11-17 16:19 ` [PATCH 06/20] habanalabs/gaudi2: add razwi notify event Oded Gabbay
2022-11-17 16:19 ` [PATCH 07/20] habanalabs: fix firmware descriptor copy operation Oded Gabbay
2022-11-17 16:19 ` [PATCH 08/20] habanalabs: skip events info ioctl if not supported Oded Gabbay
2022-11-17 16:19 ` [PATCH 09/20] habanalabs/gaudi2: classify power/thermal events as info Oded Gabbay
2022-11-17 16:19 ` [PATCH 10/20] habanalabs/gaudi2: add page fault notify event Oded Gabbay
2022-11-17 16:19 ` [PATCH 11/20] habanalabs: fix print for out-of-sync and pkt-failure events Oded Gabbay
2022-11-17 16:19 ` [PATCH 12/20] habanalabs/gaudi: fix print for firmware-alive event Oded Gabbay
2022-11-17 16:19 ` [PATCH 13/20] habanalabs/gaudi2: remove redundant firmware version check Oded Gabbay
2022-11-17 16:19 ` [PATCH 14/20] habanalabs/gaudi2: don't enable entries in the MSIX_GW table Oded Gabbay
2022-11-17 16:19 ` [PATCH 15/20] habanalabs/gaudi2: return to reset upon SM SEI BRESP error Oded Gabbay
2022-11-17 16:19 ` [PATCH 16/20] habanalabs: reset device if still in use when released Oded Gabbay
2022-11-17 16:19 ` [PATCH 17/20] habanalabs: check schedule_hard_reset correctly Oded Gabbay
2022-11-17 16:19 ` [PATCH 18/20] habanalabs: extend process wait timeout in device fini Oded Gabbay
2022-11-17 16:19 ` [PATCH 19/20] habanalabs/gaudi2: change memory scrub mechanism Oded Gabbay
2022-11-17 16:19 ` [PATCH 20/20] habanalabs: increase the size of busy engines mask Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.