All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/12] accel/habanalabs: rename security functions related arguments
@ 2023-05-16  9:30 Oded Gabbay
  2023-05-16  9:30 ` [PATCH 02/12] accel/habanalabs: set unused bit as reserved Oded Gabbay
                   ` (10 more replies)
  0 siblings, 11 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Rename the register array arguments so that their names make clear they
hold registers that should be unsecured so the user can access them.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/security.c | 57 +++++++++++-----------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/drivers/accel/habanalabs/common/security.c b/drivers/accel/habanalabs/common/security.c
index dc23ff57c91a..fe913965dbad 100644
--- a/drivers/accel/habanalabs/common/security.c
+++ b/drivers/accel/habanalabs/common/security.c
@@ -284,14 +284,14 @@ void hl_secure_block(struct hl_device *hdev,
  * @instance_offset: offset between instances
  * @pb_blocks: blocks array
  * @blocks_array_size: blocks array size
- * @regs_array: register array
- * @regs_array_size: register array size
+ * @user_regs_array: unsecured register array
+ * @user_regs_array_size: unsecured register array size
  * @mask: enabled instances mask: 1- enabled, 0- disabled
  */
 int hl_init_pb_with_mask(struct hl_device *hdev, u32 num_dcores,
 		u32 dcore_offset, u32 num_instances, u32 instance_offset,
 		const u32 pb_blocks[], u32 blocks_array_size,
-		const u32 *regs_array, u32 regs_array_size, u64 mask)
+		const u32 *user_regs_array, u32 user_regs_array_size, u64 mask)
 {
 	int i, j;
 	struct hl_block_glbl_sec *glbl_sec;
@@ -303,8 +303,8 @@ int hl_init_pb_with_mask(struct hl_device *hdev, u32 num_dcores,
 		return -ENOMEM;
 
 	hl_secure_block(hdev, glbl_sec, blocks_array_size);
-	hl_unsecure_registers(hdev, regs_array, regs_array_size, 0, pb_blocks,
-			glbl_sec, blocks_array_size);
+	hl_unsecure_registers(hdev, user_regs_array, user_regs_array_size, 0,
+			pb_blocks, glbl_sec, blocks_array_size);
 
 	/* Fill all blocks with the same configuration */
 	for (i = 0 ; i < num_dcores ; i++) {
@@ -336,19 +336,19 @@ int hl_init_pb_with_mask(struct hl_device *hdev, u32 num_dcores,
  * @instance_offset: offset between instances
  * @pb_blocks: blocks array
  * @blocks_array_size: blocks array size
- * @regs_array: register array
- * @regs_array_size: register array size
+ * @user_regs_array: unsecured register array
+ * @user_regs_array_size: unsecured register array size
  *
  */
 int hl_init_pb(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset,
 		u32 num_instances, u32 instance_offset,
 		const u32 pb_blocks[], u32 blocks_array_size,
-		const u32 *regs_array, u32 regs_array_size)
+		const u32 *user_regs_array, u32 user_regs_array_size)
 {
 	return hl_init_pb_with_mask(hdev, num_dcores, dcore_offset,
 			num_instances, instance_offset, pb_blocks,
-			blocks_array_size, regs_array, regs_array_size,
-			ULLONG_MAX);
+			blocks_array_size, user_regs_array,
+			user_regs_array_size, ULLONG_MAX);
 }
 
 /**
@@ -364,15 +364,15 @@ int hl_init_pb(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset,
  * @instance_offset: offset between instances
  * @pb_blocks: blocks array
  * @blocks_array_size: blocks array size
- * @regs_range_array: register range array
- * @regs_range_array_size: register range array size
+ * @user_regs_range_array: unsecured register range array
+ * @user_regs_range_array_size: unsecured register range array size
  * @mask: enabled instances mask: 1- enabled, 0- disabled
  */
 int hl_init_pb_ranges_with_mask(struct hl_device *hdev, u32 num_dcores,
 		u32 dcore_offset, u32 num_instances, u32 instance_offset,
 		const u32 pb_blocks[], u32 blocks_array_size,
-		const struct range *regs_range_array, u32 regs_range_array_size,
-		u64 mask)
+		const struct range *user_regs_range_array,
+		u32 user_regs_range_array_size, u64 mask)
 {
 	int i, j, rc = 0;
 	struct hl_block_glbl_sec *glbl_sec;
@@ -384,8 +384,8 @@ int hl_init_pb_ranges_with_mask(struct hl_device *hdev, u32 num_dcores,
 		return -ENOMEM;
 
 	hl_secure_block(hdev, glbl_sec, blocks_array_size);
-	rc = hl_unsecure_registers_range(hdev, regs_range_array,
-			regs_range_array_size, 0, pb_blocks, glbl_sec,
+	rc = hl_unsecure_registers_range(hdev, user_regs_range_array,
+			user_regs_range_array_size, 0, pb_blocks, glbl_sec,
 			blocks_array_size);
 	if (rc)
 		goto free_glbl_sec;
@@ -422,19 +422,20 @@ int hl_init_pb_ranges_with_mask(struct hl_device *hdev, u32 num_dcores,
  * @instance_offset: offset between instances
  * @pb_blocks: blocks array
  * @blocks_array_size: blocks array size
- * @regs_range_array: register range array
- * @regs_range_array_size: register range array size
+ * @user_regs_range_array: unsecured register range array
+ * @user_regs_range_array_size: unsecured register range array size
  *
  */
 int hl_init_pb_ranges(struct hl_device *hdev, u32 num_dcores,
 		u32 dcore_offset, u32 num_instances, u32 instance_offset,
 		const u32 pb_blocks[], u32 blocks_array_size,
-		const struct range *regs_range_array, u32 regs_range_array_size)
+		const struct range *user_regs_range_array,
+		u32 user_regs_range_array_size)
 {
 	return hl_init_pb_ranges_with_mask(hdev, num_dcores, dcore_offset,
 			num_instances, instance_offset, pb_blocks,
-			blocks_array_size, regs_range_array,
-			regs_range_array_size, ULLONG_MAX);
+			blocks_array_size, user_regs_range_array,
+			user_regs_range_array_size, ULLONG_MAX);
 }
 
 /**
@@ -447,14 +448,14 @@ int hl_init_pb_ranges(struct hl_device *hdev, u32 num_dcores,
  * @instance_offset: offset between instances
  * @pb_blocks: blocks array
  * @blocks_array_size: blocks array size
- * @regs_array: register array
- * @regs_array_size: register array size
+ * @user_regs_array: unsecured register array
+ * @user_regs_array_size: unsecured register array size
  *
  */
 int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset,
 		u32 num_instances, u32 instance_offset,
 		const u32 pb_blocks[], u32 blocks_array_size,
-		const u32 *regs_array, u32 regs_array_size)
+		const u32 *user_regs_array, u32 user_regs_array_size)
 {
 	int i, rc = 0;
 	struct hl_block_glbl_sec *glbl_sec;
@@ -466,8 +467,8 @@ int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset,
 		return -ENOMEM;
 
 	hl_secure_block(hdev, glbl_sec, blocks_array_size);
-	rc = hl_unsecure_registers(hdev, regs_array, regs_array_size, 0,
-			pb_blocks, glbl_sec, blocks_array_size);
+	rc = hl_unsecure_registers(hdev, user_regs_array, user_regs_array_size,
+			0, pb_blocks, glbl_sec, blocks_array_size);
 	if (rc)
 		goto free_glbl_sec;
 
@@ -495,8 +496,8 @@ int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset,
  * @instance_offset: offset between instances
  * @pb_blocks: blocks array
  * @blocks_array_size: blocks array size
- * @user_regs_range_array: register range array
- * @user_regs_range_array_size: register range array size
+ * @user_regs_range_array: unsecured register range array
+ * @user_regs_range_array_size: unsecured register range array size
  *
  */
 int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset,
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 02/12] accel/habanalabs: set unused bit as reserved
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-17 18:03   ` Ofir Bitton
  2023-05-16  9:30 ` [PATCH 03/12] accel/habanalabs: fix mem leak in capture user mappings Oded Gabbay
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel

Get the latest f/w gaudi2 interface file, which marks the unused
bist_need_iatu_config bit in the cold_rst_data structure as a reserved bit.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
index 8522f24deac0..18ca147b1c86 100644
--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
@@ -62,7 +62,7 @@ struct gaudi2_cold_rst_data {
 			u32 fake_security_enable : 1;
 			u32 fake_sig_validation_en : 1;
 			u32 bist_skip_enable : 1;
-			u32 bist_need_iatu_config : 1;
+			u32 reserved1 : 1;
 			u32 fake_bis_compliant : 1;
 			u32 wd_rst_cause_arm : 1;
 			u32 wd_rst_cause_arcpid : 1;
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 03/12] accel/habanalabs: fix mem leak in capture user mappings
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
  2023-05-16  9:30 ` [PATCH 02/12] accel/habanalabs: set unused bit as reserved Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-16  9:30 ` [PATCH 04/12] accel/habanalabs: align to latest firmware specs Oded Gabbay
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Moti Haimovski, Dani Liberman

From: Moti Haimovski <mhaimovski@habana.ai>

This commit fixes a memory leak that occurs when the user_mappings info
is cleared as a new context is opened immediately after user_mapping is
captured and a hard reset is performed.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/habanalabs_drv.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c
index 1ec97da3dddb..70fb2df9a93b 100644
--- a/drivers/accel/habanalabs/common/habanalabs_drv.c
+++ b/drivers/accel/habanalabs/common/habanalabs_drv.c
@@ -13,6 +13,7 @@
 
 #include <linux/pci.h>
 #include <linux/module.h>
+#include <linux/vmalloc.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/habanalabs.h>
@@ -218,6 +219,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 	hl_debugfs_add_file(hpriv);
 
+	vfree(hdev->captured_err_info.page_fault_info.user_mappings);
 	memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info));
 	atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
 	hdev->captured_err_info.undef_opcode.write_enable = true;
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 04/12] accel/habanalabs: align to latest firmware specs
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
  2023-05-16  9:30 ` [PATCH 02/12] accel/habanalabs: set unused bit as reserved Oded Gabbay
  2023-05-16  9:30 ` [PATCH 03/12] accel/habanalabs: fix mem leak in capture user mappings Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-17 18:03   ` Ofir Bitton
  2023-05-16  9:30 ` [PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck Oded Gabbay
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel

Update the firmware common interface files with the latest version.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/include/common/cpucp_if.h      | 18 ++++----
 .../habanalabs/include/common/hl_boot_if.h    | 41 ++++---------------
 2 files changed, 16 insertions(+), 43 deletions(-)

diff --git a/drivers/accel/habanalabs/include/common/cpucp_if.h b/drivers/accel/habanalabs/include/common/cpucp_if.h
index f68308cc2524..33807b839c37 100644
--- a/drivers/accel/habanalabs/include/common/cpucp_if.h
+++ b/drivers/accel/habanalabs/include/common/cpucp_if.h
@@ -359,7 +359,7 @@ struct hl_eq_entry {
 	union {
 		__le64 data_placeholder;
 		struct hl_eq_ecc_data ecc_data;
-		struct hl_eq_hbm_ecc_data hbm_ecc_data;	/* Gaudi1 HBM */
+		struct hl_eq_hbm_ecc_data hbm_ecc_data;	/* Obsolete */
 		struct hl_eq_sm_sei_data sm_sei_data;
 		struct cpucp_pkt_sync_err pkt_sync_err;
 		struct hl_eq_fw_alive fw_alive;
@@ -653,7 +653,7 @@ enum pq_init_status {
  *       which address is passed via the CpuCp packet. In addition, the host's driver
  *       passes the max size it allows the CpuCP to write to the structure, to prevent
  *       data corruption in case of mismatched driver/FW versions.
- *       Relevant only to Gaudi.
+ *       Obsolete.
  *
  * CPUCP_PACKET_GENERIC_PASSTHROUGH -
  *      Generic opcode for all firmware info that is only passed to host
@@ -868,19 +868,19 @@ struct cpucp_array_data_packet {
 enum cpucp_led_index {
 	CPUCP_LED0_INDEX = 0,
 	CPUCP_LED1_INDEX,
-	CPUCP_LED2_INDEX
+	CPUCP_LED2_INDEX,
+	CPUCP_LED_MAX_INDEX = CPUCP_LED2_INDEX
 };
 
 /*
  * enum cpucp_packet_rc - Error return code
  * @cpucp_packet_success	-> in case of success.
- * @cpucp_packet_invalid	-> this is to support Goya and Gaudi platform.
+ * @cpucp_packet_invalid	-> this is to support first generation platforms.
  * @cpucp_packet_fault		-> in case of processing error like failing to
  *                                 get device binding or semaphore etc.
- * @cpucp_packet_invalid_pkt	-> when cpucp packet is un-supported. This is
- *                                 supported Greco onwards.
+ * @cpucp_packet_invalid_pkt	-> when cpucp packet is un-supported.
  * @cpucp_packet_invalid_params	-> when checking parameter like length of buffer
- *				   or attribute value etc. Supported Greco onwards.
+ *				   or attribute value etc.
  * @cpucp_packet_rc_max		-> It indicates size of enum so should be at last.
  */
 enum cpucp_packet_rc {
@@ -1365,7 +1365,7 @@ struct cpucp_dev_info_signed {
 #define DCORE_MON_REGS_SZ	512
 /*
  * struct dcore_monitor_regs_data - DCORE monitor regs data.
- * the structure follows sync manager block layout. relevant only to Gaudi.
+ * the structure follows sync manager block layout. Obsolete.
  * @mon_pay_addrl: array of payload address low bits.
  * @mon_pay_addrh: array of payload address high bits.
  * @mon_pay_data: array of payload data.
@@ -1380,7 +1380,7 @@ struct dcore_monitor_regs_data {
 	__le32 mon_status[DCORE_MON_REGS_SZ];
 };
 
-/* contains SM data for each SYNC_MNGR (relevant only to Gaudi) */
+/* contains SM data for each SYNC_MNGR (Obsolete) */
 struct cpucp_monitor_dump {
 	struct dcore_monitor_regs_data sync_mngr_w_s;
 	struct dcore_monitor_regs_data sync_mngr_e_s;
diff --git a/drivers/accel/habanalabs/include/common/hl_boot_if.h b/drivers/accel/habanalabs/include/common/hl_boot_if.h
index c58d76a2705c..cff79f7f9f75 100644
--- a/drivers/accel/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/accel/habanalabs/include/common/hl_boot_if.h
@@ -35,6 +35,7 @@ enum cpu_boot_err {
 	CPU_BOOT_ERR_TPM_FAIL = 20,
 	CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL = 21,
 	CPU_BOOT_ERR_EEPROM_FAIL = 22,
+	CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL = 23,
 	CPU_BOOT_ERR_ENABLED = 31,
 	CPU_BOOT_ERR_SCND_EN = 63,
 	CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */
@@ -51,6 +52,7 @@ enum cpu_boot_err {
 		 (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) |	\
 		 (1 << CPU_BOOT_ERR_BINNING_FAIL) |		\
 		 (1 << CPU_BOOT_ERR_DRAM_SKIPPED) |		\
+		 (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) |	\
 		 (1 << CPU_BOOT_ERR_EEPROM_FAIL))
 
 /*
@@ -132,6 +134,9 @@ enum cpu_boot_err {
  * CPU_BOOT_ERR_EEPROM_FAIL		Failed reading EEPROM data. Defaults
  *					are used.
  *
+ * CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL	Failed scrubbing the Engines/ARCFarm
+ *					memories. Boot disabled until reset.
+ *
  * CPU_BOOT_ERR0_ENABLED		Error registers enabled.
  *					This is a main indication that the
  *					running FW populates the error
@@ -157,6 +162,7 @@ enum cpu_boot_err {
 #define CPU_BOOT_ERR0_TPM_FAIL			(1 << CPU_BOOT_ERR_TPM_FAIL)
 #define CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL	(1 << CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL)
 #define CPU_BOOT_ERR0_EEPROM_FAIL		(1 << CPU_BOOT_ERR_EEPROM_FAIL)
+#define CPU_BOOT_ERR0_ENG_ARC_MEM_SCRUB_FAIL	(1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
 #define CPU_BOOT_ERR0_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
 #define CPU_BOOT_ERR1_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
 
@@ -744,36 +750,6 @@ struct comms_status {
 	};
 };
 
-/**
- * HL_MODULES_MAX_NUM is determined by the size of modules_mask in struct
- *      hl_component_versions
- */
-enum hl_modules {
-	HL_MODULES_BOOT_INFO = 0,
-	HL_MODULES_EEPROM,
-	HL_MODULES_FDT,
-	HL_MODULES_I2C,
-	HL_MODULES_LZ4,
-	HL_MODULES_MBEDTLS,
-	HL_MODULES_MAX_NUM = 16
-};
-
-/**
- * HL_COMPONENTS_MAX_NUM is determined by the size of components_mask in
- *      struct cpucp_versions
- */
-enum hl_components {
-	HL_COMPONENTS_PID = 0,
-	HL_COMPONENTS_MGMT,
-	HL_COMPONENTS_PREBOOT,
-	HL_COMPONENTS_PPBOOT,
-	HL_COMPONENTS_ARMCP,
-	HL_COMPONENTS_CPLD,
-	HL_COMPONENTS_UBOOT,
-	HL_COMPONENTS_FUSE,
-	HL_COMPONENTS_MAX_NUM = 16
-};
-
 #define NAME_MAX_LEN	32 /* bytes */
 struct hl_module_data {
 	__u8 name[NAME_MAX_LEN];
@@ -787,8 +763,6 @@ struct hl_module_data {
  * @component: version of the component itself.
  * @fw_os: Firmware OS Version.
  * @comp_name: Name of the component.
- * @modules_mask: i'th bit (from LSB) is a flag - on if module i in enum
- *              hl_modules is used.
  * @modules_counter: number of set bits in modules_mask.
  * @reserved: reserved for future use.
  * @modules: versions of the component's modules. Elborated explanation in
@@ -800,9 +774,8 @@ struct hl_component_versions {
 	__u8 component[VERSION_MAX_LEN];
 	__u8 fw_os[VERSION_MAX_LEN];
 	__u8 comp_name[NAME_MAX_LEN];
-	__le16 modules_mask;
 	__u8 modules_counter;
-	__u8 reserved[1];
+	__u8 reserved[3];
 	struct hl_module_data modules[];
 };
 
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
                   ` (2 preceding siblings ...)
  2023-05-16  9:30 ` [PATCH 04/12] accel/habanalabs: align to latest firmware specs Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-17 18:01   ` Ofir Bitton
  2023-05-16  9:30 ` [PATCH 06/12] accel/habanalabs: upon DMA errors, use FW-extracted error cause Oded Gabbay
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel

If a workload got stuck, we print an error to the kernel log about it.
Add to that print the configured max timeout value, as that value is
not fixed between ASICs and in addition it can be configured using
a kernel module parameter.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
index ccf68f482948..4ec28af3ed78 100644
--- a/drivers/accel/habanalabs/common/command_submission.c
+++ b/drivers/accel/habanalabs/common/command_submission.c
@@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref)
 
 static void cs_timedout(struct work_struct *work)
 {
+	struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
+	bool skip_reset_on_timeout, device_reset = false;
 	struct hl_device *hdev;
 	u64 event_mask = 0x0;
+	uint timeout_sec;
 	int rc;
-	struct hl_cs *cs = container_of(work, struct hl_cs,
-						 work_tdr.work);
-	bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
+
+	skip_reset_on_timeout = cs->skip_reset_on_timeout;
 
 	rc = cs_get_unless_zero(cs);
 	if (!rc)
@@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work)
 		event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
 	}
 
+	timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
+
 	switch (cs->type) {
 	case CS_TYPE_SIGNAL:
 		dev_err(hdev->dev,
-			"Signal command submission %llu has not finished in time!\n",
-			cs->sequence);
+			"Signal command submission %llu has not finished in %u seconds!\n",
+			cs->sequence, timeout_sec);
 		break;
 
 	case CS_TYPE_WAIT:
 		dev_err(hdev->dev,
-			"Wait command submission %llu has not finished in time!\n",
-			cs->sequence);
+			"Wait command submission %llu has not finished in %u seconds!\n",
+			cs->sequence, timeout_sec);
 		break;
 
 	case CS_TYPE_COLLECTIVE_WAIT:
 		dev_err(hdev->dev,
-			"Collective Wait command submission %llu has not finished in time!\n",
-			cs->sequence);
+			"Collective Wait command submission %llu has not finished in %u seconds!\n",
+			cs->sequence, timeout_sec);
 		break;
 
 	default:
 		dev_err(hdev->dev,
-			"Command submission %llu has not finished in time!\n",
-			cs->sequence);
+			"Command submission %llu has not finished in %u seconds!\n",
+			cs->sequence, timeout_sec);
 		break;
 	}
 
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 06/12] accel/habanalabs: upon DMA errors, use FW-extracted error cause
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
                   ` (3 preceding siblings ...)
  2023-05-16  9:30 ` [PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-16  9:30 ` [PATCH 07/12] accel/habanalabs: remove support for mmu disable Oded Gabbay
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Initially, the driver used to read the error cause data directly from
the ASIC. However, the FW now clears it before the driver can read
it. Therefore we should use the error cause data that is extracted by
the FW.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 37 +++++-------------------
 1 file changed, 8 insertions(+), 29 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index e900017f4ff7..b8644d87f817 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -8807,13 +8807,13 @@ static int gaudi2_handle_kdma_core_event(struct hl_device *hdev, u16 event_type,
 	return error_count;
 }
 
-static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, int sts_addr)
+static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, u64 intr_cause)
 {
-	u32 error_count = 0, sts_val = RREG32(sts_addr);
+	u32 error_count = 0;
 	int i;
 
 	for (i = 0 ; i < GAUDI2_NUM_OF_DMA_CORE_INTR_CAUSE ; i++)
-		if (sts_val & BIT(i)) {
+		if (intr_cause & BIT(i)) {
 			gaudi2_print_event(hdev, event_type, true,
 				"err cause: %s", gaudi2_dma_core_interrupts_cause[i]);
 			error_count++;
@@ -8824,27 +8824,6 @@ static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type,
 	return error_count;
 }
 
-static int gaudi2_handle_pdma_core_event(struct hl_device *hdev, u16 event_type, int pdma_idx)
-{
-	u32 sts_addr;
-
-	sts_addr = mmPDMA0_CORE_ERR_CAUSE + pdma_idx * PDMA_OFFSET;
-	return gaudi2_handle_dma_core_event(hdev, event_type, sts_addr);
-}
-
-static int gaudi2_handle_edma_core_event(struct hl_device *hdev, u16 event_type, int edma_idx)
-{
-	static const int edma_event_index_map[] = {2, 3, 0, 1, 6, 7, 4, 5};
-	u32 sts_addr, index;
-
-	index = edma_event_index_map[edma_idx];
-
-	sts_addr = mmDCORE0_EDMA0_CORE_ERR_CAUSE +
-				DCORE_OFFSET * (index / NUM_OF_EDMA_PER_DCORE) +
-				DCORE_EDMA_OFFSET * (index % NUM_OF_EDMA_PER_DCORE);
-	return gaudi2_handle_dma_core_event(hdev, event_type, sts_addr);
-}
-
 static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev, u64 *event_mask)
 {
 	u32 mstr_if_base_addr = mmPCIE_MSTR_RR_MSTR_IF_RR_SHRD_HBW_BASE, razwi_happened_addr;
@@ -9725,19 +9704,19 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP:
 	case GAUDI2_EVENT_KDMA0_CORE:
 		error_count = gaudi2_handle_kdma_core_event(hdev, event_type,
-					le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+				le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		break;
 
 	case GAUDI2_EVENT_HDMA2_CORE ... GAUDI2_EVENT_HDMA5_CORE:
-		index = event_type - GAUDI2_EVENT_HDMA2_CORE;
-		error_count = gaudi2_handle_edma_core_event(hdev, event_type, index);
+		error_count = gaudi2_handle_dma_core_event(hdev, event_type,
+				le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
 	case GAUDI2_EVENT_PDMA0_CORE ... GAUDI2_EVENT_PDMA1_CORE:
-		index = event_type - GAUDI2_EVENT_PDMA0_CORE;
-		error_count = gaudi2_handle_pdma_core_event(hdev, event_type, index);
+		error_count = gaudi2_handle_dma_core_event(hdev, event_type,
+				le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 07/12] accel/habanalabs: remove support for mmu disable
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
                   ` (4 preceding siblings ...)
  2023-05-16  9:30 ` [PATCH 06/12] accel/habanalabs: upon DMA errors, use FW-extracted error cause Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-16  9:30 ` [PATCH 08/12] accel/habanalabs: use binning info when handling razwi Oded Gabbay
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

As mmu disable mode is only used for bring-up stages, let's remove this
option and all code related to it.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../accel/habanalabs/common/command_buffer.c  |   6 -
 .../habanalabs/common/command_submission.c    |  22 ++--
 drivers/accel/habanalabs/common/debugfs.c     |  23 +---
 drivers/accel/habanalabs/common/habanalabs.h  |  18 +--
 .../accel/habanalabs/common/habanalabs_drv.c  |   2 -
 .../habanalabs/common/habanalabs_ioctl.c      |   9 +-
 drivers/accel/habanalabs/common/memory.c      | 104 +-----------------
 drivers/accel/habanalabs/common/mmu/mmu.c     |  56 ++--------
 drivers/accel/habanalabs/gaudi/gaudi.c        |   6 +-
 drivers/accel/habanalabs/goya/goya.c          |   3 -
 .../accel/habanalabs/goya/goya_coresight.c    |   9 +-
 11 files changed, 26 insertions(+), 232 deletions(-)

diff --git a/drivers/accel/habanalabs/common/command_buffer.c b/drivers/accel/habanalabs/common/command_buffer.c
index 6e09f48750a0..08f7aee42624 100644
--- a/drivers/accel/habanalabs/common/command_buffer.c
+++ b/drivers/accel/habanalabs/common/command_buffer.c
@@ -27,12 +27,6 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 		return -EINVAL;
 	}
 
-	if (!hdev->mmu_enable) {
-		dev_err_ratelimited(hdev->dev,
-				"Cannot map CB because MMU is disabled\n");
-		return -EINVAL;
-	}
-
 	if (cb->is_mmu_mapped)
 		return 0;
 
diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
index 4ec28af3ed78..104c3ce60655 100644
--- a/drivers/accel/habanalabs/common/command_submission.c
+++ b/drivers/accel/habanalabs/common/command_submission.c
@@ -280,14 +280,8 @@ bool cs_needs_timeout(struct hl_cs *cs)
 
 static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
 {
-	/*
-	 * Patched CB is created for external queues jobs, and for H/W queues
-	 * jobs if the user CB was allocated by driver and MMU is disabled.
-	 */
-	return (job->queue_type == QUEUE_TYPE_EXT ||
-			(job->queue_type == QUEUE_TYPE_HW &&
-					job->is_kernel_allocated_cb &&
-					!hdev->mmu_enable));
+	/* Patched CB is created for external queues jobs */
+	return (job->queue_type == QUEUE_TYPE_EXT);
 }
 
 /*
@@ -363,14 +357,13 @@ static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job)
 		}
 	}
 
-	/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
-	 * enabled, the user CB isn't released in cs_parser() and thus should be
+	/* For H/W queue jobs, if a user CB was allocated by driver,
+	 * the user CB isn't released in cs_parser() and thus should be
 	 * released here. This is also true for INT queues jobs which were
 	 * allocated by driver.
 	 */
-	if ((job->is_kernel_allocated_cb &&
-		((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
-				job->queue_type == QUEUE_TYPE_INT))) {
+	if (job->is_kernel_allocated_cb &&
+			(job->queue_type == QUEUE_TYPE_HW || job->queue_type == QUEUE_TYPE_INT)) {
 		atomic_dec(&job->user_cb->cs_cnt);
 		hl_cb_put(job->user_cb);
 	}
@@ -1951,8 +1944,7 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
 	else
 		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
 
-	cb = hl_cb_kernel_create(hdev, cb_size,
-				q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
+	cb = hl_cb_kernel_create(hdev, cb_size, q_type == QUEUE_TYPE_HW);
 	if (!cb) {
 		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
 		atomic64_inc(&cntr->out_of_mem_drop_cnt);
diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c
index 29b78c7cf5de..9e84a47a21dc 100644
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -255,9 +255,6 @@ static int vm_show(struct seq_file *s, void *data)
 	u64 j;
 	int i;
 
-	if (!dev_entry->hdev->mmu_enable)
-		return 0;
-
 	mutex_lock(&dev_entry->ctx_mem_hash_mutex);
 
 	list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) {
@@ -436,9 +433,6 @@ static int mmu_show(struct seq_file *s, void *data)
 	u64 virt_addr = dev_entry->mmu_addr, phys_addr;
 	int i;
 
-	if (!hdev->mmu_enable)
-		return 0;
-
 	if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID)
 		ctx = hdev->kernel_ctx;
 	else
@@ -496,9 +490,6 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
 	char *c;
 	ssize_t rc;
 
-	if (!hdev->mmu_enable)
-		return count;
-
 	if (count > sizeof(kbuf) - 1)
 		goto err;
 	if (copy_from_user(kbuf, buf, count))
@@ -535,9 +526,6 @@ static int mmu_ack_error(struct seq_file *s, void *data)
 	struct hl_device *hdev = dev_entry->hdev;
 	int rc;
 
-	if (!hdev->mmu_enable)
-		return 0;
-
 	if (!dev_entry->mmu_cap_mask) {
 		dev_err(hdev->dev, "mmu_cap_mask is not set\n");
 		goto err;
@@ -563,9 +551,6 @@ static ssize_t mmu_ack_error_value_write(struct file *file,
 	char kbuf[MMU_KBUF_SIZE];
 	ssize_t rc;
 
-	if (!hdev->mmu_enable)
-		return count;
-
 	if (count > sizeof(kbuf) - 1)
 		goto err;
 
@@ -661,9 +646,6 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 
-	if (!hdev->mmu_enable)
-		goto out;
-
 	if (prop->dram_supports_virtual_memory &&
 		(addr >= prop->dmmu.start_addr && addr < prop->dmmu.end_addr))
 		return true;
@@ -675,7 +657,7 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
 	if (addr >= prop->pmmu_huge.start_addr &&
 		addr < prop->pmmu_huge.end_addr)
 		return true;
-out:
+
 	return false;
 }
 
@@ -685,9 +667,6 @@ static bool hl_is_device_internal_memory_va(struct hl_device *hdev, u64 addr,
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	u64 dram_start_addr, dram_end_addr;
 
-	if (!hdev->mmu_enable)
-		return false;
-
 	if (prop->dram_supports_virtual_memory) {
 		dram_start_addr = prop->dmmu.start_addr;
 		dram_end_addr = prop->dmmu.end_addr;
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index ea0914d08bdc..e2341a75a4b7 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -115,18 +115,6 @@ enum hl_mmu_page_table_location {
 	MMU_NUM_PGT_LOCATIONS	/* num of PGT locations */
 };
 
-/**
- * enum hl_mmu_enablement - what mmu modules to enable
- * @MMU_EN_NONE: mmu disabled.
- * @MMU_EN_ALL: enable all.
- * @MMU_EN_PMMU_ONLY: Enable only the PMMU leaving the DMMU disabled.
- */
-enum hl_mmu_enablement {
-	MMU_EN_NONE = 0,
-	MMU_EN_ALL = 1,
-	MMU_EN_PMMU_ONLY = 3,	/* N/A for Goya/Gaudi */
-};
-
 /*
  * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream
  * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream
@@ -3319,7 +3307,7 @@ struct hl_reset_info {
  * @nic_ports_mask: Controls which NIC ports are enabled. Used only for testing.
  * @fw_components: Controls which f/w components to load to the device. There are multiple f/w
  *                 stages and sometimes we want to stop at a certain stage. Used only for testing.
- * @mmu_enable: Whether to enable or disable the device MMU(s). Used only for testing.
+ * @mmu_disable: Disable the device MMU(s). Used only for testing.
  * @cpu_queues_enable: Whether to enable queues communication vs. the f/w. Used only for testing.
  * @pldm: Whether we are running in Palladium environment. Used only for testing.
  * @hard_reset_on_fw_events: Whether to do device hard-reset when a fatal event is received from
@@ -3482,7 +3470,7 @@ struct hl_device {
 	/* Parameters for bring-up to be upstreamed */
 	u64				nic_ports_mask;
 	u64				fw_components;
-	u8				mmu_enable;
+	u8				mmu_disable;
 	u8				cpu_queues_enable;
 	u8				pldm;
 	u8				hard_reset_on_fw_events;
@@ -3827,8 +3815,6 @@ struct pgt_info *hl_mmu_hr_get_alloc_next_hop(struct hl_ctx *ctx,
 							u64 curr_pte, bool *is_new_hop);
 int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops,
 							struct hl_hr_mmu_funcs *hr_func);
-void hl_mmu_swap_out(struct hl_ctx *ctx);
-void hl_mmu_swap_in(struct hl_ctx *ctx);
 int hl_mmu_if_set_funcs(struct hl_device *hdev);
 void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
 void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c
index 70fb2df9a93b..446f444a1c7e 100644
--- a/drivers/accel/habanalabs/common/habanalabs_drv.c
+++ b/drivers/accel/habanalabs/common/habanalabs_drv.c
@@ -307,7 +307,6 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
 {
 	hdev->nic_ports_mask = 0;
 	hdev->fw_components = FW_TYPE_ALL_TYPES;
-	hdev->mmu_enable = MMU_EN_ALL;
 	hdev->cpu_queues_enable = 1;
 	hdev->pldm = 0;
 	hdev->hard_reset_on_fw_events = 1;
@@ -382,7 +381,6 @@ static int fixup_device_params(struct hl_device *hdev)
 	/* If CPU queues not enabled, no way to do heartbeat */
 	if (!hdev->cpu_queues_enable)
 		hdev->heartbeat = 0;
-
 	fixup_device_params_per_asic(hdev, tmp_timeout);
 
 	return 0;
diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
index 4368e6c9a23a..9a8be9395fb2 100644
--- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
@@ -62,7 +62,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	hw_ip.device_id = hdev->asic_funcs->get_pci_id(hdev);
 	hw_ip.sram_base_address = prop->sram_user_base_address;
 	hw_ip.dram_base_address =
-			hdev->mmu_enable && prop->dram_supports_virtual_memory ?
+			prop->dram_supports_virtual_memory ?
 			prop->dmmu.start_addr : prop->dram_user_base_address;
 	hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask & 0xFF;
 	hw_ip.tpc_enabled_mask_ext = prop->tpc_enabled_mask;
@@ -71,11 +71,8 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 
 	dram_available_size = prop->dram_size - dram_kmd_size;
 
-	if (hdev->mmu_enable == MMU_EN_ALL)
-		hw_ip.dram_size = DIV_ROUND_DOWN_ULL(dram_available_size,
-				prop->dram_page_size) * prop->dram_page_size;
-	else
-		hw_ip.dram_size = dram_available_size;
+	hw_ip.dram_size = DIV_ROUND_DOWN_ULL(dram_available_size, prop->dram_page_size) *
+				prop->dram_page_size;
 
 	if (hw_ip.dram_size > PAGE_SIZE)
 		hw_ip.dram_enabled = 1;
diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
index a7b6a273ce21..4fc72a07d2f5 100644
--- a/drivers/accel/habanalabs/common/memory.c
+++ b/drivers/accel/habanalabs/common/memory.c
@@ -1034,30 +1034,6 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
 	}
 }
 
-static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
-					u64 *paddr)
-{
-	struct hl_device *hdev = ctx->hdev;
-	struct hl_vm *vm = &hdev->vm;
-	struct hl_vm_phys_pg_pack *phys_pg_pack;
-	u32 handle;
-
-	handle = lower_32_bits(args->map_device.handle);
-	spin_lock(&vm->idr_lock);
-	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
-	if (!phys_pg_pack) {
-		spin_unlock(&vm->idr_lock);
-		dev_err(hdev->dev, "no match for handle %u\n", handle);
-		return -EINVAL;
-	}
-
-	*paddr = phys_pg_pack->pages[0];
-
-	spin_unlock(&vm->idr_lock);
-
-	return 0;
-}
-
 /**
  * map_device_va() - map the given memory.
  * @ctx: pointer to the context structure.
@@ -2094,76 +2070,6 @@ static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 addr, u64 size, u64 o
 	return rc;
 }
 
-static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
-{
-	struct hl_device *hdev = hpriv->hdev;
-	u64 block_handle, device_addr = 0;
-	struct hl_ctx *ctx = hpriv->ctx;
-	u32 handle = 0, block_size;
-	int rc;
-
-	switch (args->in.op) {
-	case HL_MEM_OP_ALLOC:
-		if (args->in.alloc.mem_size == 0) {
-			dev_err(hdev->dev, "alloc size must be larger than 0\n");
-			rc = -EINVAL;
-			goto out;
-		}
-
-		/* Force contiguous as there are no real MMU
-		 * translations to overcome physical memory gaps
-		 */
-		args->in.flags |= HL_MEM_CONTIGUOUS;
-		rc = alloc_device_memory(ctx, &args->in, &handle);
-
-		memset(args, 0, sizeof(*args));
-		args->out.handle = (__u64) handle;
-		break;
-
-	case HL_MEM_OP_FREE:
-		rc = free_device_memory(ctx, &args->in);
-		break;
-
-	case HL_MEM_OP_MAP:
-		if (args->in.flags & HL_MEM_USERPTR) {
-			dev_err(hdev->dev, "Failed to map host memory when MMU is disabled\n");
-			rc = -EPERM;
-		} else {
-			rc = get_paddr_from_handle(ctx, &args->in, &device_addr);
-			memset(args, 0, sizeof(*args));
-			args->out.device_virt_addr = device_addr;
-		}
-
-		break;
-
-	case HL_MEM_OP_UNMAP:
-		rc = 0;
-		break;
-
-	case HL_MEM_OP_MAP_BLOCK:
-		rc = map_block(hdev, args->in.map_block.block_addr, &block_handle, &block_size);
-		args->out.block_handle = block_handle;
-		args->out.block_size = block_size;
-		break;
-
-	case HL_MEM_OP_EXPORT_DMABUF_FD:
-		dev_err(hdev->dev, "Failed to export dma-buf object when MMU is disabled\n");
-		rc = -EPERM;
-		break;
-
-	case HL_MEM_OP_TS_ALLOC:
-		rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle);
-		break;
-	default:
-		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
-		rc = -EINVAL;
-		break;
-	}
-
-out:
-	return rc;
-}
-
 static void ts_buff_release(struct hl_mmap_mem_buf *buf)
 {
 	struct hl_ts_buff *ts_buff = buf->private;
@@ -2282,9 +2188,6 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 		return -EBUSY;
 	}
 
-	if (!hdev->mmu_enable)
-		return mem_ioctl_no_mmu(hpriv, args);
-
 	switch (args->in.op) {
 	case HL_MEM_OP_ALLOC:
 		if (args->in.alloc.mem_size == 0) {
@@ -2779,13 +2682,10 @@ int hl_vm_ctx_init(struct hl_ctx *ctx)
 	atomic64_set(&ctx->dram_phys_mem, 0);
 
 	/*
-	 * - If MMU is enabled, init the ranges as usual.
-	 * - If MMU is disabled, in case of host mapping, the returned address
-	 *   is the given one.
 	 *   In case of DRAM mapping, the returned address is the physical
 	 *   address of the memory related to the given handle.
 	 */
-	if (!ctx->hdev->mmu_enable)
+	if (ctx->hdev->mmu_disable)
 		return 0;
 
 	dram_range_start = prop->dmmu.start_addr;
@@ -2835,7 +2735,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 	struct hl_mem_in args;
 	int i;
 
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return;
 
 	hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c
index f379e5b461a6..b2145716c605 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu.c
@@ -44,7 +44,7 @@ int hl_mmu_init(struct hl_device *hdev)
 {
 	int rc = -EOPNOTSUPP;
 
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return 0;
 
 	mutex_init(&hdev->mmu_lock);
@@ -82,7 +82,7 @@ int hl_mmu_init(struct hl_device *hdev)
  */
 void hl_mmu_fini(struct hl_device *hdev)
 {
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return;
 
 	if (hdev->mmu_func[MMU_DR_PGT].fini != NULL)
@@ -107,7 +107,7 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx)
 	struct hl_device *hdev = ctx->hdev;
 	int rc = -EOPNOTSUPP;
 
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return 0;
 
 	if (hdev->mmu_func[MMU_DR_PGT].ctx_init != NULL) {
@@ -145,7 +145,7 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
 
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return;
 
 	if (hdev->mmu_func[MMU_DR_PGT].ctx_fini != NULL)
@@ -233,7 +233,7 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, bool flu
 	u64 real_virt_addr;
 	bool is_dram_addr;
 
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return 0;
 
 	is_dram_addr = hl_is_dram_va(hdev, virt_addr);
@@ -301,7 +301,7 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_s
 	bool is_dram_addr;
 
 
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return 0;
 
 	is_dram_addr = hl_is_dram_va(hdev, virt_addr);
@@ -472,46 +472,6 @@ int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size)
 	return rc;
 }
 
-/*
- * hl_mmu_swap_out - marks all mapping of the given ctx as swapped out
- *
- * @ctx: pointer to the context structure
- *
- */
-void hl_mmu_swap_out(struct hl_ctx *ctx)
-{
-	struct hl_device *hdev = ctx->hdev;
-
-	if (!hdev->mmu_enable)
-		return;
-
-	if (hdev->mmu_func[MMU_DR_PGT].swap_out != NULL)
-		hdev->mmu_func[MMU_DR_PGT].swap_out(ctx);
-
-	if (hdev->mmu_func[MMU_HR_PGT].swap_out != NULL)
-		hdev->mmu_func[MMU_HR_PGT].swap_out(ctx);
-}
-
-/*
- * hl_mmu_swap_in - marks all mapping of the given ctx as swapped in
- *
- * @ctx: pointer to the context structure
- *
- */
-void hl_mmu_swap_in(struct hl_ctx *ctx)
-{
-	struct hl_device *hdev = ctx->hdev;
-
-	if (!hdev->mmu_enable)
-		return;
-
-	if (hdev->mmu_func[MMU_DR_PGT].swap_in != NULL)
-		hdev->mmu_func[MMU_DR_PGT].swap_in(ctx);
-
-	if (hdev->mmu_func[MMU_HR_PGT].swap_in != NULL)
-		hdev->mmu_func[MMU_HR_PGT].swap_in(ctx);
-}
-
 static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr,
 						struct hl_mmu_hop_info *hops,
 						u64 *phys_addr)
@@ -594,7 +554,7 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
 	int pgt_residency, rc;
 	bool is_dram_addr;
 
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return -EOPNOTSUPP;
 
 	prop = &hdev->asic_prop;
@@ -625,7 +585,7 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
 
 int hl_mmu_if_set_funcs(struct hl_device *hdev)
 {
-	if (!hdev->mmu_enable)
+	if (hdev->mmu_disable)
 		return 0;
 
 	switch (hdev->asic_type) {
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index a1697581c218..056e2ef44afb 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -1469,8 +1469,7 @@ static int gaudi_collective_wait_create_job(struct hl_device *hdev,
 	}
 
 	/* Allocate internal mapped CB for non patched CBs */
-	cb = hl_cb_kernel_create(hdev, cb_size,
-			hdev->mmu_enable && !patched_cb);
+	cb = hl_cb_kernel_create(hdev, cb_size, !patched_cb);
 	if (!cb) {
 		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
 		atomic64_inc(&cntr->out_of_mem_drop_cnt);
@@ -3644,9 +3643,6 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 	u64 hop0_addr;
 	int rc, i;
 
-	if (!hdev->mmu_enable)
-		return 0;
-
 	if (gaudi->hw_cap_initialized & HW_CAP_MMU)
 		return 0;
 
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index fb0ac9df841a..7c685e6075f6 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -2671,9 +2671,6 @@ int goya_mmu_init(struct hl_device *hdev)
 	u64 hop0_addr;
 	int rc, i;
 
-	if (!hdev->mmu_enable)
-		return 0;
-
 	if (goya->hw_cap_initialized & HW_CAP_MMU)
 		return 0;
 
diff --git a/drivers/accel/habanalabs/goya/goya_coresight.c b/drivers/accel/habanalabs/goya/goya_coresight.c
index e7ac3046cfaa..a6d6cc38bcd8 100644
--- a/drivers/accel/habanalabs/goya/goya_coresight.c
+++ b/drivers/accel/habanalabs/goya/goya_coresight.c
@@ -371,13 +371,8 @@ static int goya_etr_validate_address(struct hl_device *hdev, u64 addr,
 		return false;
 	}
 
-	if (hdev->mmu_enable) {
-		range_start = prop->dmmu.start_addr;
-		range_end = prop->dmmu.end_addr;
-	} else {
-		range_start = prop->dram_user_base_address;
-		range_end = prop->dram_end_address;
-	}
+	range_start = prop->dmmu.start_addr;
+	range_end = prop->dmmu.end_addr;
 
 	return hl_mem_area_inside_range(addr, size, range_start, range_end);
 }
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 08/12] accel/habanalabs: use binning info when handling razwi
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
                   ` (5 preceding siblings ...)
  2023-05-16  9:30 ` [PATCH 07/12] accel/habanalabs: remove support for mmu disable Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-16  9:30 ` [PATCH 09/12] accel/habanalabs: use lower QM in QM errors handling Oded Gabbay
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

When receiving an SEI interrupt from a TPC or a decoder, we need to
check the binning mask, because if the engine is binned, the razwi
info won't be in the router of the binned engine; instead, it will be
in the router of the substitute engine.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index b8644d87f817..a6aa17d86820 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -8040,7 +8040,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 				u8 module_sub_idx, u64 *event_mask)
 {
 	bool via_sft = false;
-	u32 hbw_rtr_id, lbw_rtr_id, dcore_id, dcore_rtr_id, eng_id;
+	u32 hbw_rtr_id, lbw_rtr_id, dcore_id, dcore_rtr_id, eng_id, binned_idx;
 	u64 hbw_rtr_mstr_if_base_addr, lbw_rtr_mstr_if_base_addr;
 	u32 hbw_shrd_aw = 0, hbw_shrd_ar = 0;
 	u32 lbw_shrd_aw = 0, lbw_shrd_ar = 0;
@@ -8048,6 +8048,13 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 
 	switch (module) {
 	case RAZWI_TPC:
+		sprintf(initiator_name, "TPC_%u", module_idx);
+		if (hdev->tpc_binning) {
+			binned_idx = __ffs(hdev->tpc_binning);
+			if (binned_idx == module_idx)
+				module_idx = TPC_ID_DCORE0_TPC6;
+		}
+
 		hbw_rtr_id = gaudi2_tpc_initiator_hbw_rtr_id[module_idx];
 
 		if (hl_is_fw_sw_ver_below(hdev, 1, 9) &&
@@ -8056,7 +8063,6 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 			lbw_rtr_id = DCORE0_RTR0;
 		else
 			lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx];
-		sprintf(initiator_name, "TPC_%u", module_idx);
 		break;
 	case RAZWI_MME:
 		sprintf(initiator_name, "MME_%u", module_idx);
@@ -8115,9 +8121,14 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 		sprintf(initiator_name, "NIC_%u", module_idx);
 		break;
 	case RAZWI_DEC:
+		sprintf(initiator_name, "DEC_%u", module_idx);
+		if (hdev->decoder_binning) {
+			binned_idx = __ffs(hdev->decoder_binning);
+			if (binned_idx == module_idx)
+				module_idx = DEC_ID_PCIE_VDEC1;
+		}
 		hbw_rtr_id = gaudi2_dec_initiator_hbw_rtr_id[module_idx];
 		lbw_rtr_id = gaudi2_dec_initiator_lbw_rtr_id[module_idx];
-		sprintf(initiator_name, "DEC_%u", module_idx);
 		break;
 	case RAZWI_ROT:
 		hbw_rtr_id = gaudi2_rot_initiator_hbw_rtr_id[module_idx];
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 09/12] accel/habanalabs: use lower QM in QM errors handling
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
                   ` (6 preceding siblings ...)
  2023-05-16  9:30 ` [PATCH 08/12] accel/habanalabs: use binning info when handling razwi Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-16  9:30 ` [PATCH 10/12] accel/habanalabs: print qman data on error only for lower qman Oded Gabbay
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

The QMAN GLBL_ERR_STS_4 register has indications for errors also in the
lower CQ and the ARC CQ, and not just for errors in the lower CP.
Modify the relevant define/struct and the related print to use "lower
QM" instead of "lower CP".

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index a6aa17d86820..6e2561ead546 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -57,7 +57,7 @@
 
 #define GAUDI2_NA_EVENT_CAUSE			0xFF
 #define GAUDI2_NUM_OF_QM_ERR_CAUSE		18
-#define GAUDI2_NUM_OF_QM_LCP_ERR_CAUSE		25
+#define GAUDI2_NUM_OF_LOWER_QM_ERR_CAUSE	25
 #define GAUDI2_NUM_OF_QM_ARB_ERR_CAUSE		3
 #define GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE		14
 #define GAUDI2_NUM_OF_CPU_SEI_ERR_CAUSE		3
@@ -801,7 +801,7 @@ static const char * const gaudi2_qman_error_cause[GAUDI2_NUM_OF_QM_ERR_CAUSE] =
 	"PQC L2H error"
 };
 
-static const char * const gaudi2_qman_lower_cp_error_cause[GAUDI2_NUM_OF_QM_LCP_ERR_CAUSE] = {
+static const char * const gaudi2_lower_qman_error_cause[GAUDI2_NUM_OF_LOWER_QM_ERR_CAUSE] = {
 	"RSVD0",
 	"CQ AXI HBW error",
 	"CP AXI HBW error",
@@ -7895,8 +7895,8 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
 			continue;
 
 		if (i == QMAN_STREAMS) {
-			snprintf(reg_desc, ARRAY_SIZE(reg_desc), "LowerCP");
-			num_error_causes = GAUDI2_NUM_OF_QM_LCP_ERR_CAUSE;
+			snprintf(reg_desc, ARRAY_SIZE(reg_desc), "LowerQM");
+			num_error_causes = GAUDI2_NUM_OF_LOWER_QM_ERR_CAUSE;
 		} else {
 			snprintf(reg_desc, ARRAY_SIZE(reg_desc), "stream%u", i);
 			num_error_causes = GAUDI2_NUM_OF_QM_ERR_CAUSE;
@@ -7907,7 +7907,7 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
 				gaudi2_print_event(hdev, event_type, true,
 					"%s. err cause: %s", reg_desc,
 					i == QMAN_STREAMS ?
-					gaudi2_qman_lower_cp_error_cause[j] :
+					gaudi2_lower_qman_error_cause[j] :
 					gaudi2_qman_error_cause[j]);
 				error_count++;
 			}
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 10/12] accel/habanalabs: print qman data on error only for lower qman
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
                   ` (7 preceding siblings ...)
  2023-05-16  9:30 ` [PATCH 09/12] accel/habanalabs: use lower QM in QM errors handling Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-16  9:30 ` [PATCH 11/12] accel/habanalabs: update state when loading boot fit Oded Gabbay
  2023-05-16  9:30 ` [PATCH 12/12] accel/habanalabs: mask part of hmmu page fault captured address Oded Gabbay
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

By default, the upper QMANs are not used; instead, the engines' ARCs
access the lower QMANs directly.
Errors for upper QMANs are therefore not expected, and the debug print
of the PQ entries is not needed.

Modify the QMAN debug data print on errors to include only information
for the lower QMAN.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 146 +++---------------
 drivers/accel/habanalabs/gaudi2/gaudi2P.h     |   2 +-
 .../include/gaudi2/asic_reg/gaudi2_regs.h     |  11 ++
 3 files changed, 31 insertions(+), 128 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 6e2561ead546..4981b8eb0ff5 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7744,137 +7744,28 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 	return !!ecc_data->is_critical;
 }
 
-/*
- * gaudi2_queue_idx_dec - decrement queue index (pi/ci) and handle wrap
- *
- * @idx: the current pi/ci value
- * @q_len: the queue length (power of 2)
- *
- * @return the cyclically decremented index
- */
-static inline u32 gaudi2_queue_idx_dec(u32 idx, u32 q_len)
-{
-	u32 mask = q_len - 1;
-
-	/*
-	 * modular decrement is equivalent to adding (queue_size -1)
-	 * later we take LSBs to make sure the value is in the
-	 * range [0, queue_len - 1]
-	 */
-	return (idx + q_len - 1) & mask;
-}
-
-/**
- * gaudi2_print_sw_config_stream_data - print SW config stream data
- *
- * @hdev: pointer to the habanalabs device structure
- * @stream: the QMAN's stream
- * @qman_base: base address of QMAN registers block
- */
-static void gaudi2_print_sw_config_stream_data(struct hl_device *hdev,
-						u32 stream, u64 qman_base)
+static void print_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base)
 {
-	u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr;
-	u32 cq_ptr_lo_off, size;
+	u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
+	u64 cq_ptr, arc_cq_ptr, cp_current_inst;
 
-	cq_ptr_lo_off = mmDCORE0_TPC0_QM_CQ_PTR_LO_1 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0;
-
-	cq_ptr_lo = qman_base + (mmDCORE0_TPC0_QM_CQ_PTR_LO_0 - mmDCORE0_TPC0_QM_BASE) +
-									stream * cq_ptr_lo_off;
-
-	cq_ptr_hi = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_PTR_HI_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0);
-
-	cq_tsize = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_TSIZE_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0);
-
-	cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo);
-	size = RREG32(cq_tsize);
-	dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n",
-		stream, cq_ptr, size);
-}
-
-/**
- * gaudi2_print_last_pqes_on_err - print last PQEs on error
- *
- * @hdev: pointer to the habanalabs device structure
- * @qid_base: first QID of the QMAN (out of 4 streams)
- * @stream: the QMAN's stream
- * @qman_base: base address of QMAN registers block
- * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
- */
-static void gaudi2_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, u32 stream,
-						u64 qman_base, bool pr_sw_conf)
-{
-	u32 ci, qm_ci_stream_off;
-	struct hl_hw_queue *q;
-	u64 pq_ci;
-	int i;
+	lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
+	hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
+	cq_ptr = ((u64) hi) << 32 | lo;
+	cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
 
-	q = &hdev->kernel_queues[qid_base + stream];
-
-	qm_ci_stream_off = mmDCORE0_TPC0_QM_PQ_CI_1 - mmDCORE0_TPC0_QM_PQ_CI_0;
-	pq_ci = qman_base + (mmDCORE0_TPC0_QM_PQ_CI_0 - mmDCORE0_TPC0_QM_BASE) +
-						stream * qm_ci_stream_off;
-
-	hdev->asic_funcs->hw_queues_lock(hdev);
-
-	if (pr_sw_conf)
-		gaudi2_print_sw_config_stream_data(hdev, stream, qman_base);
-
-	ci = RREG32(pq_ci);
-
-	/* we should start printing form ci -1 */
-	ci = gaudi2_queue_idx_dec(ci, HL_QUEUE_LENGTH);
-
-	for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) {
-		struct hl_bd *bd;
-		u64 addr;
-		u32 len;
-
-		bd = q->kernel_address;
-		bd += ci;
-
-		len = le32_to_cpu(bd->len);
-		/* len 0 means uninitialized entry- break */
-		if (!len)
-			break;
-
-		addr = le64_to_cpu(bd->ptr);
-
-		dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n",
-			stream, ci, addr, len);
-
-		/* get previous ci, wrap if needed */
-		ci = gaudi2_queue_idx_dec(ci, HL_QUEUE_LENGTH);
-	}
-
-	hdev->asic_funcs->hw_queues_unlock(hdev);
-}
-
-/**
- * print_qman_data_on_err - extract QMAN data on error
- *
- * @hdev: pointer to the habanalabs device structure
- * @qid_base: first QID of the QMAN (out of 4 streams)
- * @stream: the QMAN's stream
- * @qman_base: base address of QMAN registers block
- *
- * This function attempt to extract as much data as possible on QMAN error.
- * On upper CP print the SW config stream data and last 8 PQEs.
- * On lower CP print SW config data and last PQEs of ALL 4 upper CPs
- */
-static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base, u32 stream, u64 qman_base)
-{
-	u32 i;
-
-	if (stream != QMAN_STREAMS) {
-		gaudi2_print_last_pqes_on_err(hdev, qid_base, stream, qman_base, true);
-		return;
-	}
+	lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
+	hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
+	arc_cq_ptr = ((u64) hi) << 32 | lo;
+	arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
 
-	gaudi2_print_sw_config_stream_data(hdev, stream, qman_base);
+	lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
+	hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
+	cp_current_inst = ((u64) hi) << 32 | lo;
 
-	for (i = 0 ; i < QMAN_STREAMS ; i++)
-		gaudi2_print_last_pqes_on_err(hdev, qid_base, i, qman_base, false);
+	dev_info(hdev->dev,
+		"LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
+		cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst);
 }
 
 static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type,
@@ -7912,7 +7803,8 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
 				error_count++;
 			}
 
-		print_qman_data_on_err(hdev, qid_base, i, qman_base);
+		if (i == QMAN_STREAMS)
+			print_lower_qman_data_on_err(hdev, qman_base);
 	}
 
 	arb_err_val = RREG32(arb_err_addr);
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
index 1cebe707772e..5f3ce086928e 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -98,7 +98,7 @@
 #define GAUDI2_DEFAULT_CARD_NAME		"HL225"
 
 #define QMAN_STREAMS				4
-#define PQ_FETCHER_CACHE_SIZE			8
+
 #define NUM_OF_MME_SBTE_PORTS			5
 #define NUM_OF_MME_WB_PORTS			2
 
diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
index 6c58af614236..a08378d0802b 100644
--- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
+++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
@@ -242,6 +242,17 @@
 #define QM_FENCE2_OFFSET		(mmPDMA0_QM_CP_FENCE2_RDATA_0 - mmPDMA0_QM_BASE)
 #define QM_SEI_STATUS_OFFSET		(mmPDMA0_QM_SEI_STATUS - mmPDMA0_QM_BASE)
 
+#define QM_CQ_PTR_LO_4_OFFSET		(mmPDMA0_QM_CQ_PTR_LO_4 - mmPDMA0_QM_BASE)
+#define QM_CQ_PTR_HI_4_OFFSET		(mmPDMA0_QM_CQ_PTR_HI_4 - mmPDMA0_QM_BASE)
+#define QM_CQ_TSIZE_4_OFFSET		(mmPDMA0_QM_CQ_TSIZE_4 - mmPDMA0_QM_BASE)
+
+#define QM_ARC_CQ_PTR_LO_OFFSET		(mmPDMA0_QM_ARC_CQ_PTR_LO - mmPDMA0_QM_BASE)
+#define QM_ARC_CQ_PTR_HI_OFFSET		(mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE)
+#define QM_ARC_CQ_TSIZE_OFFSET		(mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE)
+
+#define QM_CP_CURRENT_INST_LO_4_OFFSET	(mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE)
+#define QM_CP_CURRENT_INST_HI_4_OFFSET	(mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE)
+
 #define SFT_OFFSET		(mmSFT1_HBW_RTR_IF0_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE)
 #define SFT_IF_RTR_OFFSET	(mmSFT0_HBW_RTR_IF1_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE)
 
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 11/12] accel/habanalabs: update state when loading boot fit
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
                   ` (8 preceding siblings ...)
  2023-05-16  9:30 ` [PATCH 10/12] accel/habanalabs: print qman data on error only for lower qman Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  2023-05-16  9:30 ` [PATCH 12/12] accel/habanalabs: mask part of hmmu page fault captured address Oded Gabbay
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Any FW component we load must be followed by a corresponding state
update. However, it seems that so far we skipped doing so for the
bootfit case, so fix that.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++-----------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index eb51d7f70aec..acbc1a6b5cb1 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2486,16 +2486,6 @@ static int hl_fw_dynamic_load_image(struct hl_device *hdev,
 	if (rc)
 		goto release_fw;
 
-	/* update state according to boot stage */
-	if (cur_fwc == FW_COMP_BOOT_FIT) {
-		struct cpu_dyn_regs *dyn_regs;
-
-		dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs;
-		hl_fw_boot_fit_update_state(hdev,
-				le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
-				le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
-	}
-
 	/* copy boot fit to space allocated by FW */
 	rc = hl_fw_dynamic_copy_image(hdev, fw, fw_loader);
 	if (rc)
@@ -2798,6 +2788,14 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 		goto protocol_err;
 	}
 
+	rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader);
+	if (rc)
+		goto protocol_err;
+
+	hl_fw_boot_fit_update_state(hdev,
+			le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
+			le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
+
 	/*
 	 * when testing FW load (without Linux) on PLDM we don't want to
 	 * wait until boot fit is active as it may take several hours.
@@ -2807,10 +2805,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	if (hdev->pldm && !(hdev->fw_components & FW_TYPE_LINUX))
 		return 0;
 
-	rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader);
-	if (rc)
-		goto protocol_err;
-
 	/* Enable DRAM scrambling before Linux boot and after successful
 	 *  UBoot
 	 */
@@ -2844,7 +2838,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	if (rc)
 		goto protocol_err;
 
-	hl_fw_linux_update_state(hdev, le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
+	hl_fw_linux_update_state(hdev,
+				le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
 				le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
 
 	hl_fw_dynamic_update_linux_interrupt_if(hdev);
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 12/12] accel/habanalabs: mask part of hmmu page fault captured address
  2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
                   ` (9 preceding siblings ...)
  2023-05-16  9:30 ` [PATCH 11/12] accel/habanalabs: update state when loading boot fit Oded Gabbay
@ 2023-05-16  9:30 ` Oded Gabbay
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2023-05-16  9:30 UTC (permalink / raw)
  To: dri-devel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

When receiving a page fault from the HMMU, the captured address is
scrambled both by the HW and by the driver. The driver part is
unscrambled, but the HW part is not.
To avoid reporting a wrong address, the HW-scrambled part is
masked.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 4981b8eb0ff5..1cb2b72e1cd2 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -162,6 +162,9 @@
 #define PSOC_RAZWI_ENG_STR_SIZE 128
 #define PSOC_RAZWI_MAX_ENG_PER_RTR 5
 
+/* HW scrambles only bits 0-25 */
+#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26)
+
 struct gaudi2_razwi_info {
 	u32 axuser_xy;
 	u32 rtr_ctrl;
@@ -8835,11 +8838,16 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool
 	addr <<= 32;
 	addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE_VA));
 
-	if (!is_pmmu)
+	if (is_pmmu) {
+		dev_err_ratelimited(hdev->dev, "PMMU page fault on va 0x%llx\n", addr);
+	} else {
+
 		addr = gaudi2_mmu_descramble_addr(hdev, addr);
+		addr &= HW_UNSCRAMBLED_BITS_MASK;
+		dev_err_ratelimited(hdev->dev, "HMMU page fault on va range 0x%llx - 0x%llx\n",
+				addr, addr + ~HW_UNSCRAMBLED_BITS_MASK);
+	}
 
-	dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n",
-				is_pmmu ? "PMMU" : "HMMU", addr);
 	hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask);
 
 	WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0);
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck
  2023-05-16  9:30 ` [PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck Oded Gabbay
@ 2023-05-17 18:01   ` Ofir Bitton
  0 siblings, 0 replies; 15+ messages in thread
From: Ofir Bitton @ 2023-05-17 18:01 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel

On 16/05/2023 12:30, Oded Gabbay wrote:
> If a workload got stuck, we print an error to the kernel log about it.
> Add to that print the configured max timeout value, as that value is
> not fixed between ASICs and in addition it can be configured using
> a kernel module parameter.
>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
> ---
>   .../habanalabs/common/command_submission.c    | 26 +++++++++++--------
>   1 file changed, 15 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
> index ccf68f482948..4ec28af3ed78 100644
> --- a/drivers/accel/habanalabs/common/command_submission.c
> +++ b/drivers/accel/habanalabs/common/command_submission.c
> @@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref)
>   
>   static void cs_timedout(struct work_struct *work)
>   {
> +	struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
> +	bool skip_reset_on_timeout, device_reset = false;
>   	struct hl_device *hdev;
>   	u64 event_mask = 0x0;
> +	uint timeout_sec;
>   	int rc;
> -	struct hl_cs *cs = container_of(work, struct hl_cs,
> -						 work_tdr.work);
> -	bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
> +
> +	skip_reset_on_timeout = cs->skip_reset_on_timeout;
>   
>   	rc = cs_get_unless_zero(cs);
>   	if (!rc)
> @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work)
>   		event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
>   	}
>   
> +	timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
> +
>   	switch (cs->type) {
>   	case CS_TYPE_SIGNAL:
>   		dev_err(hdev->dev,
> -			"Signal command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Signal command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	case CS_TYPE_WAIT:
>   		dev_err(hdev->dev,
> -			"Wait command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Wait command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	case CS_TYPE_COLLECTIVE_WAIT:
>   		dev_err(hdev->dev,
> -			"Collective Wait command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Collective Wait command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	default:
>   		dev_err(hdev->dev,
> -			"Command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   	}
>   

Reviewed-by: Ofir Bitton <obitton@habana.ai>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 04/12] accel/habanalabs: align to latest firmware specs
  2023-05-16  9:30 ` [PATCH 04/12] accel/habanalabs: align to latest firmware specs Oded Gabbay
@ 2023-05-17 18:03   ` Ofir Bitton
  0 siblings, 0 replies; 15+ messages in thread
From: Ofir Bitton @ 2023-05-17 18:03 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel

On 16/05/2023 12:30, Oded Gabbay wrote:
> Update the firmware common interface files with the latest version.
>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
> ---
>   .../habanalabs/include/common/cpucp_if.h      | 18 ++++----
>   .../habanalabs/include/common/hl_boot_if.h    | 41 ++++---------------
>   2 files changed, 16 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/accel/habanalabs/include/common/cpucp_if.h b/drivers/accel/habanalabs/include/common/cpucp_if.h
> index f68308cc2524..33807b839c37 100644
> --- a/drivers/accel/habanalabs/include/common/cpucp_if.h
> +++ b/drivers/accel/habanalabs/include/common/cpucp_if.h
> @@ -359,7 +359,7 @@ struct hl_eq_entry {
>   	union {
>   		__le64 data_placeholder;
>   		struct hl_eq_ecc_data ecc_data;
> -		struct hl_eq_hbm_ecc_data hbm_ecc_data;	/* Gaudi1 HBM */
> +		struct hl_eq_hbm_ecc_data hbm_ecc_data;	/* Obsolete */
>   		struct hl_eq_sm_sei_data sm_sei_data;
>   		struct cpucp_pkt_sync_err pkt_sync_err;
>   		struct hl_eq_fw_alive fw_alive;
> @@ -653,7 +653,7 @@ enum pq_init_status {
>    *       which address is passed via the CpuCp packet. In addition, the host's driver
>    *       passes the max size it allows the CpuCP to write to the structure, to prevent
>    *       data corruption in case of mismatched driver/FW versions.
> - *       Relevant only to Gaudi.
> + *       Obsolete.
>    *
>    * CPUCP_PACKET_GENERIC_PASSTHROUGH -
>    *      Generic opcode for all firmware info that is only passed to host
> @@ -868,19 +868,19 @@ struct cpucp_array_data_packet {
>   enum cpucp_led_index {
>   	CPUCP_LED0_INDEX = 0,
>   	CPUCP_LED1_INDEX,
> -	CPUCP_LED2_INDEX
> +	CPUCP_LED2_INDEX,
> +	CPUCP_LED_MAX_INDEX = CPUCP_LED2_INDEX
>   };
>   
>   /*
>    * enum cpucp_packet_rc - Error return code
>    * @cpucp_packet_success	-> in case of success.
> - * @cpucp_packet_invalid	-> this is to support Goya and Gaudi platform.
> + * @cpucp_packet_invalid	-> this is to support first generation platforms.
>    * @cpucp_packet_fault		-> in case of processing error like failing to
>    *                                 get device binding or semaphore etc.
> - * @cpucp_packet_invalid_pkt	-> when cpucp packet is un-supported. This is
> - *                                 supported Greco onwards.
> + * @cpucp_packet_invalid_pkt	-> when cpucp packet is un-supported.
>    * @cpucp_packet_invalid_params	-> when checking parameter like length of buffer
> - *				   or attribute value etc. Supported Greco onwards.
> + *				   or attribute value etc.
>    * @cpucp_packet_rc_max		-> It indicates size of enum so should be at last.
>    */
>   enum cpucp_packet_rc {
> @@ -1365,7 +1365,7 @@ struct cpucp_dev_info_signed {
>   #define DCORE_MON_REGS_SZ	512
>   /*
>    * struct dcore_monitor_regs_data - DCORE monitor regs data.
> - * the structure follows sync manager block layout. relevant only to Gaudi.
> + * the structure follows sync manager block layout. Obsolete.
>    * @mon_pay_addrl: array of payload address low bits.
>    * @mon_pay_addrh: array of payload address high bits.
>    * @mon_pay_data: array of payload data.
> @@ -1380,7 +1380,7 @@ struct dcore_monitor_regs_data {
>   	__le32 mon_status[DCORE_MON_REGS_SZ];
>   };
>   
> -/* contains SM data for each SYNC_MNGR (relevant only to Gaudi) */
> +/* contains SM data for each SYNC_MNGR (Obsolete) */
>   struct cpucp_monitor_dump {
>   	struct dcore_monitor_regs_data sync_mngr_w_s;
>   	struct dcore_monitor_regs_data sync_mngr_e_s;
> diff --git a/drivers/accel/habanalabs/include/common/hl_boot_if.h b/drivers/accel/habanalabs/include/common/hl_boot_if.h
> index c58d76a2705c..cff79f7f9f75 100644
> --- a/drivers/accel/habanalabs/include/common/hl_boot_if.h
> +++ b/drivers/accel/habanalabs/include/common/hl_boot_if.h
> @@ -35,6 +35,7 @@ enum cpu_boot_err {
>   	CPU_BOOT_ERR_TPM_FAIL = 20,
>   	CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL = 21,
>   	CPU_BOOT_ERR_EEPROM_FAIL = 22,
> +	CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL = 23,
>   	CPU_BOOT_ERR_ENABLED = 31,
>   	CPU_BOOT_ERR_SCND_EN = 63,
>   	CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */
> @@ -51,6 +52,7 @@ enum cpu_boot_err {
>   		 (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) |	\
>   		 (1 << CPU_BOOT_ERR_BINNING_FAIL) |		\
>   		 (1 << CPU_BOOT_ERR_DRAM_SKIPPED) |		\
> +		 (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) |	\
>   		 (1 << CPU_BOOT_ERR_EEPROM_FAIL))
>   
>   /*
> @@ -132,6 +134,9 @@ enum cpu_boot_err {
>    * CPU_BOOT_ERR_EEPROM_FAIL		Failed reading EEPROM data. Defaults
>    *					are used.
>    *
> + * CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL	Failed scrubbing the Engines/ARCFarm
> + *					memories. Boot disabled until reset.
> + *
>    * CPU_BOOT_ERR0_ENABLED		Error registers enabled.
>    *					This is a main indication that the
>    *					running FW populates the error
> @@ -157,6 +162,7 @@ enum cpu_boot_err {
>   #define CPU_BOOT_ERR0_TPM_FAIL			(1 << CPU_BOOT_ERR_TPM_FAIL)
>   #define CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL	(1 << CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL)
>   #define CPU_BOOT_ERR0_EEPROM_FAIL		(1 << CPU_BOOT_ERR_EEPROM_FAIL)
> +#define CPU_BOOT_ERR0_ENG_ARC_MEM_SCRUB_FAIL	(1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
>   #define CPU_BOOT_ERR0_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
>   #define CPU_BOOT_ERR1_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
>   
> @@ -744,36 +750,6 @@ struct comms_status {
>   	};
>   };
>   
> -/**
> - * HL_MODULES_MAX_NUM is determined by the size of modules_mask in struct
> - *      hl_component_versions
> - */
> -enum hl_modules {
> -	HL_MODULES_BOOT_INFO = 0,
> -	HL_MODULES_EEPROM,
> -	HL_MODULES_FDT,
> -	HL_MODULES_I2C,
> -	HL_MODULES_LZ4,
> -	HL_MODULES_MBEDTLS,
> -	HL_MODULES_MAX_NUM = 16
> -};
> -
> -/**
> - * HL_COMPONENTS_MAX_NUM is determined by the size of components_mask in
> - *      struct cpucp_versions
> - */
> -enum hl_components {
> -	HL_COMPONENTS_PID = 0,
> -	HL_COMPONENTS_MGMT,
> -	HL_COMPONENTS_PREBOOT,
> -	HL_COMPONENTS_PPBOOT,
> -	HL_COMPONENTS_ARMCP,
> -	HL_COMPONENTS_CPLD,
> -	HL_COMPONENTS_UBOOT,
> -	HL_COMPONENTS_FUSE,
> -	HL_COMPONENTS_MAX_NUM = 16
> -};
> -
>   #define NAME_MAX_LEN	32 /* bytes */
>   struct hl_module_data {
>   	__u8 name[NAME_MAX_LEN];
> @@ -787,8 +763,6 @@ struct hl_module_data {
>    * @component: version of the component itself.
>    * @fw_os: Firmware OS Version.
>    * @comp_name: Name of the component.
> - * @modules_mask: i'th bit (from LSB) is a flag - on if module i in enum
> - *              hl_modules is used.
>    * @modules_counter: number of set bits in modules_mask.
>    * @reserved: reserved for future use.
>    * @modules: versions of the component's modules. Elborated explanation in
> @@ -800,9 +774,8 @@ struct hl_component_versions {
>   	__u8 component[VERSION_MAX_LEN];
>   	__u8 fw_os[VERSION_MAX_LEN];
>   	__u8 comp_name[NAME_MAX_LEN];
> -	__le16 modules_mask;
>   	__u8 modules_counter;
> -	__u8 reserved[1];
> +	__u8 reserved[3];
>   	struct hl_module_data modules[];
>   };
>   

Reviewed-by: Ofir Bitton <obitton@habana.ai>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 02/12] accel/habanalabs: set unused bit as reserved
  2023-05-16  9:30 ` [PATCH 02/12] accel/habanalabs: set unused bit as reserved Oded Gabbay
@ 2023-05-17 18:03   ` Ofir Bitton
  0 siblings, 0 replies; 15+ messages in thread
From: Ofir Bitton @ 2023-05-17 18:03 UTC (permalink / raw)
  To: Oded Gabbay, dri-devel

On 16/05/2023 12:30, Oded Gabbay wrote:
> Get latest f/w gaudi2 interface file which marks unused
> bist_need_iatu_config bit in cold_rst_data structure as reserved bit.
>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
> ---
>   drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
> index 8522f24deac0..18ca147b1c86 100644
> --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
> +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
> @@ -62,7 +62,7 @@ struct gaudi2_cold_rst_data {
>   			u32 fake_security_enable : 1;
>   			u32 fake_sig_validation_en : 1;
>   			u32 bist_skip_enable : 1;
> -			u32 bist_need_iatu_config : 1;
> +			u32 reserved1 : 1;
>   			u32 fake_bis_compliant : 1;
>   			u32 wd_rst_cause_arm : 1;
>   			u32 wd_rst_cause_arcpid : 1;

Reviewed-by: Ofir Bitton <obitton@habana.ai>


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2023-05-17 18:04 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-16  9:30 [PATCH 01/12] accel/habanalabs: rename security functions related arguments Oded Gabbay
2023-05-16  9:30 ` [PATCH 02/12] accel/habanalabs: set unused bit as reserved Oded Gabbay
2023-05-17 18:03   ` Ofir Bitton
2023-05-16  9:30 ` [PATCH 03/12] accel/habanalabs: fix mem leak in capture user mappings Oded Gabbay
2023-05-16  9:30 ` [PATCH 04/12] accel/habanalabs: align to latest firmware specs Oded Gabbay
2023-05-17 18:03   ` Ofir Bitton
2023-05-16  9:30 ` [PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck Oded Gabbay
2023-05-17 18:01   ` Ofir Bitton
2023-05-16  9:30 ` [PATCH 06/12] accel/habanalabs: upon DMA errors, use FW-extracted error cause Oded Gabbay
2023-05-16  9:30 ` [PATCH 07/12] accel/habanalabs: remove support for mmu disable Oded Gabbay
2023-05-16  9:30 ` [PATCH 08/12] accel/habanalabs: use binning info when handling razwi Oded Gabbay
2023-05-16  9:30 ` [PATCH 09/12] accel/habanalabs: use lower QM in QM errors handling Oded Gabbay
2023-05-16  9:30 ` [PATCH 10/12] accel/habanalabs: print qman data on error only for lower qman Oded Gabbay
2023-05-16  9:30 ` [PATCH 11/12] accel/habanalabs: update state when loading boot fit Oded Gabbay
2023-05-16  9:30 ` [PATCH 12/12] accel/habanalabs: mask part of hmmu page fault captured address Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.