linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/13] habanalabs: use lower_32_bits()
@ 2022-10-06  8:22 Oded Gabbay
  2022-10-06  8:22 ` [PATCH 02/13] habanalabs/gaudi2: fix module ID for RAZWI handling Oded Gabbay
                   ` (11 more replies)
  0 siblings, 12 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:22 UTC (permalink / raw)
  To: linux-kernel; +Cc: Bharat Jauhari

From: Bharat Jauhari <bjauhari@habana.ai>

This fixes a sparse warning caused by casting to 32 bits.

Signed-off-by: Bharat Jauhari <bjauhari@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index cdc50c2c4de8..f4b3fa4b0976 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2528,7 +2528,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 				break; \
 			(val) = __elbi_read; \
 		} else {\
-			(val) = RREG32((u32)(addr)); \
+			(val) = RREG32(lower_32_bits(addr)); \
 		} \
 		if (cond) \
 			break; \
@@ -2539,7 +2539,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 					break; \
 				(val) = __elbi_read; \
 			} else {\
-				(val) = RREG32((u32)(addr)); \
+				(val) = RREG32(lower_32_bits(addr)); \
 			} \
 			break; \
 		} \
@@ -2594,7 +2594,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 				if (__rc) \
 					break; \
 			} else { \
-				__read_val = RREG32((u32)(addr_arr)[__arr_idx]); \
+				__read_val = RREG32(lower_32_bits(addr_arr[__arr_idx])); \
 			} \
 			if (__read_val == (expected_val))	\
 				__elem_bitmask &= ~BIT_ULL(__arr_idx);	\
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 02/13] habanalabs/gaudi2: fix module ID for RAZWI handling
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
@ 2022-10-06  8:22 ` Oded Gabbay
  2022-10-06  8:22 ` [PATCH 03/13] habanalabs: add page fault info uapi Oded Gabbay
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:22 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

RAZWI is optionally handled as part of the generic QM SEI error
handling, but it always uses PDMA as the module ID.
Fix it to use the suitable module ID according to the specific event.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 75c4bef7841c..a7eccc41d508 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -7602,6 +7602,7 @@ static void _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base)
 static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type,
 					struct hl_eq_razwi_info *razwi_info)
 {
+	enum razwi_event_sources module;
 	u64 qman_base;
 	u8 index;
 
@@ -7611,9 +7612,11 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type,
 		qman_base = mmDCORE0_TPC0_QM_BASE +
 				(index / NUM_OF_TPC_PER_DCORE) * DCORE_OFFSET +
 				(index % NUM_OF_TPC_PER_DCORE) * DCORE_TPC_OFFSET;
+		module = RAZWI_TPC;
 		break;
 	case GAUDI2_EVENT_TPC24_AXI_ERR_RSP:
 		qman_base = mmDCORE0_TPC6_QM_BASE;
+		module = RAZWI_TPC;
 		break;
 	case GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE:
 	case GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE:
@@ -7623,16 +7626,19 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type,
 				(GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE -
 						GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE);
 		qman_base = mmDCORE0_MME_QM_BASE + index * DCORE_OFFSET;
+		module = RAZWI_MME;
 		break;
 	case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
 	case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
 		index = event_type - GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP;
 		qman_base = mmPDMA0_QM_BASE + index * PDMA_OFFSET;
+		module = RAZWI_PDMA;
 		break;
 	case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE:
 	case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE:
 		index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE;
 		qman_base = mmROT0_QM_BASE + index * ROT_OFFSET;
+		module = RAZWI_ROT;
 		break;
 	default:
 		return;
@@ -7647,7 +7653,7 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type,
 
 	/* check if RAZWI happened */
 	if (razwi_info)
-		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, 0, 0, razwi_info);
+		gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, razwi_info);
 }
 
 static void gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 03/13] habanalabs: add page fault info uapi
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
  2022-10-06  8:22 ` [PATCH 02/13] habanalabs/gaudi2: fix module ID for RAZWI handling Oded Gabbay
@ 2022-10-06  8:22 ` Oded Gabbay
  2022-10-06  8:22 ` [PATCH 04/13] habanalabs: replace 'pf' to 'prefetch' Oded Gabbay
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:22 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Only the first page fault will be saved.
Besides the address which caused the page fault, the driver captures
all of the mmu user mappings.
User can retrieve this data via the new uapi (new opcode in INFO ioctl).

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c       | 58 +++++++++++++++++++
 drivers/misc/habanalabs/common/habanalabs.h   | 22 ++++++-
 .../misc/habanalabs/common/habanalabs_drv.c   |  1 +
 .../misc/habanalabs/common/habanalabs_ioctl.c | 42 ++++++++++++++
 drivers/misc/habanalabs/gaudi/gaudi.c         |  2 +
 include/uapi/misc/habanalabs.h                | 31 ++++++++++
 6 files changed, 155 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 30ddaaae67e5..5dc6c77b4721 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -12,6 +12,7 @@
 
 #include <linux/pci.h>
 #include <linux/hwmon.h>
+#include <linux/vmalloc.h>
 
 #include <trace/events/habanalabs.h>
 
@@ -2199,6 +2200,8 @@ void hl_device_fini(struct hl_device *hdev)
 
 	hl_mmu_fini(hdev);
 
+	vfree(hdev->captured_err_info.pgf_info.user_mappings);
+
 	hl_eq_fini(hdev, &hdev->event_queue);
 
 	kfree(hdev->shadow_cs_queue);
@@ -2275,3 +2278,58 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
 			num_of_engines * sizeof(u16));
 	hdev->captured_err_info.razwi.flags = flags;
 }
+static void hl_capture_user_mappings(struct hl_device *hdev)
+{
+	struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info;
+	struct hl_vm_hash_node *hnode;
+	struct hl_userptr *userptr;
+	struct hl_ctx *ctx;
+	u32 map_idx = 0;
+	int i;
+
+	ctx = hl_get_compute_ctx(hdev);
+	if (!ctx) {
+		dev_err(hdev->dev, "Can't get user context for user mappings\n");
+		return;
+	}
+
+	mutex_lock(&ctx->mem_hash_lock);
+	hash_for_each(ctx->mem_hash, i, hnode, node)
+	pgf_info->num_of_user_mappings++;
+
+	if (!pgf_info->num_of_user_mappings)
+		goto finish;
+
+	/* In case we already allocated in previous session, need to release it before
+	 * allocating new buffer.
+	 */
+	vfree(pgf_info->user_mappings);
+	pgf_info->user_mappings =
+			vmalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping));
+	if (!pgf_info->user_mappings) {
+		pgf_info->num_of_user_mappings = 0;
+		goto finish;
+	}
+
+	hash_for_each(ctx->mem_hash, i, hnode, node) {
+		userptr = hnode->ptr;
+		pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
+		pgf_info->user_mappings[map_idx].size = userptr->size;
+		map_idx++;
+	}
+finish:
+	mutex_unlock(&ctx->mem_hash_lock);
+	hl_ctx_put(ctx);
+}
+
+void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu)
+{
+	/* Capture only the first page fault */
+	if (atomic_cmpxchg(&hdev->captured_err_info.pgf_info_recorded, 0, 1))
+		return;
+
+	hdev->captured_err_info.pgf_info.pgf.timestamp = ktime_to_ns(ktime_get());
+	hdev->captured_err_info.pgf_info.pgf.addr = addr;
+	hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id;
+	hl_capture_user_mappings(hdev);
+}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index f4b3fa4b0976..1489240d5a3a 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2957,19 +2957,38 @@ struct undefined_opcode_info {
 	bool write_enable;
 };
 
+/**
+ * struct page_fault_info - info about page fault
+ * @pgf: page fault information.
+ * @user_mappings: buffer containing user mappings.
+ * @num_of_user_mappings: number of user mappings.
+ */
+struct page_fault_info {
+	struct hl_page_fault_info	pgf;
+	struct hl_user_mapping		*user_mappings;
+	u64				num_of_user_mappings;
+};
+
 /**
  * struct hl_error_info - holds information collected during an error.
  * @cs_timeout: CS timeout error information.
  * @razwi: razwi information.
  * @razwi_info_recorded: if set writing to razwi information is enabled.
- *                otherwise - disabled, so the first (root cause) razwi will not be overwritten.
+ *                       otherwise - disabled, so the first (root cause) razwi will not be
+ *                       overwritten.
  * @undef_opcode: undefined opcode information
+ * @pgf_info: page fault information.
+ * @pgf_info_recorded: if set, page fault information was already captured and writing
+ *                     to it is disabled, so the first (root cause) page fault will not
+ *                     be overwritten.
  */
 struct hl_error_info {
 	struct cs_timeout_info		cs_timeout;
 	struct hl_info_razwi_event	razwi;
 	atomic_t			razwi_info_recorded;
 	struct undefined_opcode_info	undef_opcode;
+	struct page_fault_info		pgf_info;
+	atomic_t			pgf_info_recorded;
 };
 
 /**
@@ -3781,6 +3800,7 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg,
 __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...);
 void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
 			u8 flags);
+void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);
 
 #ifdef CONFIG_DEBUG_FS
 
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index d87434b9bc16..714994725224 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -213,6 +213,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 	atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
 	atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0);
+	atomic_set(&hdev->captured_err_info.pgf_info_recorded, 0);
 	hdev->captured_err_info.undef_opcode.write_enable = true;
 
 	hdev->open_counter++;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 6aef4e24d122..cac2c7fb14f1 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -778,6 +778,42 @@ static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	return rc;
 }
 
+static int page_fault_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	struct hl_page_fault_info *info = &hdev->captured_err_info.pgf_info.pgf;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_page_fault_info)))
+				? -EFAULT : 0;
+}
+
+static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	u32 user_buf_size = args->return_size;
+	struct hl_device *hdev = hpriv->hdev;
+	struct page_fault_info *pgf_info;
+	u64 actual_size;
+
+	pgf_info = &hdev->captured_err_info.pgf_info;
+	args->array_size = pgf_info->num_of_user_mappings;
+
+	if (!out)
+		return -EINVAL;
+
+	actual_size = pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping);
+	if (user_buf_size < actual_size)
+		return -ENOMEM;
+
+	return copy_to_user(out, pgf_info->user_mappings, min_t(size_t, user_buf_size, actual_size))
+				? -EFAULT : 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -837,6 +873,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_GET_EVENTS:
 		return events_info(hpriv, args);
 
+	case HL_INFO_PAGE_FAULT_EVENT:
+		return page_fault_info(hpriv, args);
+
+	case HL_INFO_USER_MAPPINGS:
+		return user_mappings_info(hpriv, args);
+
 	default:
 		break;
 	}
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index f856ac51fde1..1a99f7be8b60 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6755,6 +6755,8 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr
 		*addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
 
 		dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr);
+		hl_capture_page_fault(hdev, *addr, 0, true);
+
 		WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0);
 	}
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index d6f84cb35e3d..2b794f54e2ed 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -778,6 +778,9 @@ enum hl_server_type {
  * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
  * HL_INFO_GET_EVENTS         - Retrieve the last occurred events
  * HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information.
+ * HL_INFO_ENGINE_STATUS - Retrieve the status of all the h/w engines in the asic.
+ * HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
+ * HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
  */
 #define HL_INFO_HW_IP_INFO			0
 #define HL_INFO_HW_EVENTS			1
@@ -809,6 +812,8 @@ enum hl_server_type {
 #define HL_INFO_GET_EVENTS			30
 #define HL_INFO_UNDEFINED_OPCODE_EVENT		31
 #define HL_INFO_ENGINE_STATUS			32
+#define HL_INFO_PAGE_FAULT_EVENT		33
+#define HL_INFO_USER_MAPPINGS			34
 
 #define HL_INFO_VERSION_MAX_LEN			128
 #define HL_INFO_CARD_NAME_MAX_LEN		16
@@ -1187,6 +1192,29 @@ struct hl_info_sec_attest {
 	__u8 pad0[2];
 };
 
+/**
+ * struct hl_page_fault_info - page fault information.
+ * @timestamp: timestamp of page fault.
+ * @addr: address whose access caused the page fault.
+ * @engine_id: engine id which caused the page fault, supported only in gaudi3.
+ */
+struct hl_page_fault_info {
+	__s64 timestamp;
+	__u64 addr;
+	__u16 engine_id;
+	__u8 pad[6];
+};
+
+/**
+ * struct hl_user_mapping - user mapping information.
+ * @dev_va: device virtual address.
+ * @size: virtual address mapping size.
+ */
+struct hl_user_mapping {
+	__u64 dev_va;
+	__u64 size;
+};
+
 enum gaudi_dcores {
 	HL_GAUDI_WS_DCORE,
 	HL_GAUDI_WN_DCORE,
@@ -1213,6 +1241,8 @@ enum gaudi_dcores {
  *                           needed, hence updating this variable so user will know the exact amount
  *                           of bytes copied by the kernel to the buffer.
  * @sec_attest_nonce: Nonce number used for attestation report.
+ * @array_size: Number of array members copied to user buffer.
+ *              Relevant for HL_INFO_USER_MAPPINGS info ioctl.
  * @pad: Padding to 64 bit.
  */
 struct hl_info_args {
@@ -1228,6 +1258,7 @@ struct hl_info_args {
 		__u32 eventfd;
 		__u32 user_buffer_actual_size;
 		__u32 sec_attest_nonce;
+		__u32 array_size;
 	};
 
 	__u32 pad;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 04/13] habanalabs: replace 'pf' to 'prefetch'
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
  2022-10-06  8:22 ` [PATCH 02/13] habanalabs/gaudi2: fix module ID for RAZWI handling Oded Gabbay
  2022-10-06  8:22 ` [PATCH 03/13] habanalabs: add page fault info uapi Oded Gabbay
@ 2022-10-06  8:22 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 05/13] habanalabs/gaudi2: remove privileged MME clock configuration Oded Gabbay
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:22 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dafna Hirschfeld

From: Dafna Hirschfeld <dhirschfeld@habana.ai>

'pf' was used as an abbreviation for prefetch, but because 'pf' already
stands for 'physical function', we decided to change it to 'prefetch'.

Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 14 ++++++-------
 drivers/misc/habanalabs/common/habanalabs.h |  8 ++++----
 drivers/misc/habanalabs/common/mmu/mmu.c    | 22 ++++++++++-----------
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 5dc6c77b4721..bf675cf39f71 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -783,8 +783,8 @@ static int device_early_init(struct hl_device *hdev)
 		goto free_cs_cmplt_wq;
 	}
 
-	hdev->pf_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
-	if (!hdev->pf_wq) {
+	hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
+	if (!hdev->prefetch_wq) {
 		dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
 		rc = -ENOMEM;
 		goto free_ts_free_wq;
@@ -794,7 +794,7 @@ static int device_early_init(struct hl_device *hdev)
 					GFP_KERNEL);
 	if (!hdev->hl_chip_info) {
 		rc = -ENOMEM;
-		goto free_pf_wq;
+		goto free_prefetch_wq;
 	}
 
 	rc = hl_mmu_if_set_funcs(hdev);
@@ -833,8 +833,8 @@ static int device_early_init(struct hl_device *hdev)
 	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
 free_chip_info:
 	kfree(hdev->hl_chip_info);
-free_pf_wq:
-	destroy_workqueue(hdev->pf_wq);
+free_prefetch_wq:
+	destroy_workqueue(hdev->prefetch_wq);
 free_ts_free_wq:
 	destroy_workqueue(hdev->ts_free_obj_wq);
 free_cs_cmplt_wq:
@@ -877,7 +877,7 @@ static void device_early_fini(struct hl_device *hdev)
 
 	kfree(hdev->hl_chip_info);
 
-	destroy_workqueue(hdev->pf_wq);
+	destroy_workqueue(hdev->prefetch_wq);
 	destroy_workqueue(hdev->ts_free_obj_wq);
 	destroy_workqueue(hdev->cs_cmplt_wq);
 	destroy_workqueue(hdev->eq_wq);
@@ -1076,7 +1076,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r
 	hl_cs_rollback_all(hdev, skip_wq_flush);
 
 	/* flush the MMU prefetch workqueue */
-	flush_workqueue(hdev->pf_wq);
+	flush_workqueue(hdev->prefetch_wq);
 
 	/* Release all pending user interrupts, each pending user interrupt
 	 * holds a reference to user context
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 1489240d5a3a..6d8ce4a1dbb1 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2811,7 +2811,7 @@ struct hl_mmu_funcs {
 
 /**
  * struct hl_prefetch_work - prefetch work structure handler
- * @pf_work: actual work struct.
+ * @prefetch_work: actual work struct.
  * @ctx: compute context.
  * @va: virtual address to pre-fetch.
  * @size: pre-fetch size.
@@ -2819,7 +2819,7 @@ struct hl_mmu_funcs {
  * @asid: ASID for maintenance operation.
  */
 struct hl_prefetch_work {
-	struct work_struct	pf_work;
+	struct work_struct	prefetch_work;
 	struct hl_ctx		*ctx;
 	u64			va;
 	u64			size;
@@ -3060,7 +3060,7 @@ struct hl_reset_info {
  * @cs_cmplt_wq: work queue of CS completions for executing work in process
  *               context.
  * @ts_free_obj_wq: work queue for timestamp registration objects release.
- * @pf_wq: work queue for MMU pre-fetch operations.
+ * @prefetch_wq: work queue for MMU pre-fetch operations.
  * @kernel_ctx: Kernel driver context structure.
  * @kernel_queues: array of hl_hw_queue.
  * @cs_mirror_list: CS mirror list for TDR.
@@ -3231,7 +3231,7 @@ struct hl_device {
 	struct workqueue_struct		*eq_wq;
 	struct workqueue_struct		*cs_cmplt_wq;
 	struct workqueue_struct		*ts_free_obj_wq;
-	struct workqueue_struct		*pf_wq;
+	struct workqueue_struct		*prefetch_wq;
 	struct hl_ctx			*kernel_ctx;
 	struct hl_hw_queue		*kernel_queues;
 	struct list_head		cs_mirror_list;
diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c
index cf8946266615..589179f8cd41 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu.c
@@ -699,7 +699,7 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
 
 static void hl_mmu_prefetch_work_function(struct work_struct *work)
 {
-	struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, pf_work);
+	struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, prefetch_work);
 	struct hl_ctx *ctx = pfw->ctx;
 	struct hl_device *hdev = ctx->hdev;
 
@@ -723,25 +723,25 @@ static void hl_mmu_prefetch_work_function(struct work_struct *work)
 
 int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size)
 {
-	struct hl_prefetch_work *handle_pf_work;
+	struct hl_prefetch_work *handle_prefetch_work;
 
-	handle_pf_work = kmalloc(sizeof(*handle_pf_work), GFP_KERNEL);
-	if (!handle_pf_work)
+	handle_prefetch_work = kmalloc(sizeof(*handle_prefetch_work), GFP_KERNEL);
+	if (!handle_prefetch_work)
 		return -ENOMEM;
 
-	INIT_WORK(&handle_pf_work->pf_work, hl_mmu_prefetch_work_function);
-	handle_pf_work->ctx = ctx;
-	handle_pf_work->va = va;
-	handle_pf_work->size = size;
-	handle_pf_work->flags = flags;
-	handle_pf_work->asid = asid;
+	INIT_WORK(&handle_prefetch_work->prefetch_work, hl_mmu_prefetch_work_function);
+	handle_prefetch_work->ctx = ctx;
+	handle_prefetch_work->va = va;
+	handle_prefetch_work->size = size;
+	handle_prefetch_work->flags = flags;
+	handle_prefetch_work->asid = asid;
 
 	/*
 	 * as actual prefetch is done in a WQ we must get the context (and put it
 	 * at the end of the work function)
 	 */
 	hl_ctx_get(ctx);
-	queue_work(ctx->hdev->pf_wq, &handle_pf_work->pf_work);
+	queue_work(ctx->hdev->prefetch_wq, &handle_prefetch_work->prefetch_work);
 
 	return 0;
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 05/13] habanalabs/gaudi2: remove privileged MME clock configuration
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (2 preceding siblings ...)
  2022-10-06  8:22 ` [PATCH 04/13] habanalabs: replace 'pf' to 'prefetch' Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 06/13] habanalabs/gaudi2: add device unavailable notification Oded Gabbay
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Privileged MME clock configuration is removed as it is done by the f/w.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index a7eccc41d508..90e1d7fcb17a 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -4535,7 +4535,7 @@ static void gaudi2_init_mme_acc(struct hl_device *hdev, u32 reg_base)
 static void gaudi2_init_dcore_mme(struct hl_device *hdev, int dcore_id,
 							bool config_qman_only)
 {
-	u32 queue_id_base, reg_base, clk_en_addr = 0;
+	u32 queue_id_base, reg_base;
 
 	switch (dcore_id) {
 	case 0:
@@ -4543,23 +4543,18 @@ static void gaudi2_init_dcore_mme(struct hl_device *hdev, int dcore_id,
 		break;
 	case 1:
 		queue_id_base = GAUDI2_QUEUE_ID_DCORE1_MME_0_0;
-		clk_en_addr = mmDCORE1_MME_CTRL_LO_QM_SLV_CLK_EN;
 		break;
 	case 2:
 		queue_id_base = GAUDI2_QUEUE_ID_DCORE2_MME_0_0;
 		break;
 	case 3:
 		queue_id_base = GAUDI2_QUEUE_ID_DCORE3_MME_0_0;
-		clk_en_addr = mmDCORE3_MME_CTRL_LO_QM_SLV_CLK_EN;
 		break;
 	default:
 		dev_err(hdev->dev, "Invalid dcore id %u\n", dcore_id);
 		return;
 	}
 
-	if (clk_en_addr && !(hdev->fw_components & FW_TYPE_BOOT_CPU))
-		WREG32(clk_en_addr, 0x1);
-
 	if (!config_qman_only) {
 		reg_base = gaudi2_mme_acc_blocks_bases[dcore_id];
 		gaudi2_init_mme_acc(hdev, reg_base);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 06/13] habanalabs/gaudi2: add device unavailable notification
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (3 preceding siblings ...)
  2022-10-06  8:23 ` [PATCH 05/13] habanalabs/gaudi2: remove privileged MME clock configuration Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 07/13] habanalabs: skip idle status check if reset on device release Oded Gabbay
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

Device unavailable notifies the user that there isn't an option to
retrieve debug information from the device.
When a critical device error occurs and the f/w performs the device
reset, a device unavailable notification shall be sent to the user
process.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 90e1d7fcb17a..e05ffaa047a2 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8576,7 +8576,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 {
 	u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY;
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
-	bool reset_required = false, skip_reset = false;
+	bool reset_required = false, skip_reset = false, is_critical = false;
 	int index, sbte_index;
 	u64 event_mask = 0;
 	u16 event_type;
@@ -8602,6 +8602,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
 		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
 		reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+		is_critical = eq_entry->ecc_data.is_critical;
 		break;
 
 	case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_PDMA1_QM:
@@ -8976,9 +8977,16 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	return;
 
 reset_device:
-	if (hdev->hard_reset_on_fw_events) {
+	if (hdev->asic_prop.fw_security_enabled && is_critical) {
+		reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW;
+
+		/* notify on device unavailable when the reset is triggered by fw */
+		event_mask |= (HL_NOTIFIER_EVENT_DEVICE_RESET |
+					HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE);
 		hl_device_reset(hdev, reset_flags);
+	} else if (hdev->hard_reset_on_fw_events) {
 		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
+		hl_device_reset(hdev, reset_flags);
 	} else {
 		if (!gaudi2_irq_map_table[event_type].msg)
 			hl_fw_unmask_irq(hdev, event_type);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 07/13] habanalabs: skip idle status check if reset on device release
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (4 preceding siblings ...)
  2022-10-06  8:23 ` [PATCH 06/13] habanalabs/gaudi2: add device unavailable notification Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 08/13] habanalabs: allow unregistering eventfd when device non-operational Oded Gabbay
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

If reset upon device release is enabled, there is no need to check the
device idle status in hpriv_release(), because device is going to be
reset in any case.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index bf675cf39f71..e60ed0c8a9db 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -398,16 +398,14 @@ static void hpriv_release(struct kref *ref)
 	mutex_destroy(&hpriv->ctx_lock);
 	mutex_destroy(&hpriv->restore_phase_mutex);
 
-	if ((!hdev->pldm) && (hdev->pdev) &&
-			(!hdev->asic_funcs->is_device_idle(hdev,
-				idle_mask,
-				HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) {
-		dev_err(hdev->dev,
-			"device not idle after user context is closed (0x%llx_%llx)\n",
-			idle_mask[1], idle_mask[0]);
+	/* No need for idle status check if device is going to be reset in any case */
+	if (!hdev->reset_upon_device_release && hdev->pdev && !hdev->pldm)
+		device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
+							HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL);
 
-		device_is_idle = false;
-	}
+	if (!device_is_idle)
+		dev_err(hdev->dev, "device not idle after user context is closed (0x%llx_%llx)\n",
+			idle_mask[1], idle_mask[0]);
 
 	/* We need to remove the user from the list to make sure the reset process won't
 	 * try to kill the user process. Because, if we got here, it means there are no
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 08/13] habanalabs: allow unregistering eventfd when device non-operational
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (5 preceding siblings ...)
  2022-10-06  8:23 ` [PATCH 07/13] habanalabs: skip idle status check if reset on device release Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 09/13] habanalabs: move reset workqueue to be under hl_device Oded Gabbay
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Unregistering eventfd is for releasing host resources and doesn't
involve an access to the device. As such, there is no reason to disallow
it when device isn't operational.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs_ioctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index cac2c7fb14f1..5ce5c42e2731 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -879,6 +879,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_USER_MAPPINGS:
 		return user_mappings_info(hpriv, args);
 
+	case HL_INFO_UNREGISTER_EVENTFD:
+		return eventfd_unregister(hpriv, args);
+
 	default:
 		break;
 	}
@@ -935,9 +938,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_REGISTER_EVENTFD:
 		return eventfd_register(hpriv, args);
 
-	case HL_INFO_UNREGISTER_EVENTFD:
-		return eventfd_unregister(hpriv, args);
-
 	case HL_INFO_ENGINE_STATUS:
 		return engine_status_info(hpriv, args);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 09/13] habanalabs: move reset workqueue to be under hl_device
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (6 preceding siblings ...)
  2022-10-06  8:23 ` [PATCH 08/13] habanalabs: allow unregistering eventfd when device non-operational Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 10/13] habanalabs: handle HBM MMU when capturing page fault data Oded Gabbay
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

'struct hl_device_reset_work' is used as a wrapper for the reset work
and its parameters, including the reset workqueue on which it runs.
In a future commit, another reset related work with similar parameters
is going to be added, but it won't use the reset workqueue.

As in any case there is a single reset workqueue, and to allow the reuse
of this structure, move the reset workqueue to 'struct hl_device'.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 15 ++++++---------
 drivers/misc/habanalabs/common/habanalabs.h | 12 ++++++------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index e60ed0c8a9db..e9b373a8cdad 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -684,9 +684,8 @@ static void device_hard_reset_pending(struct work_struct *work)
 			"Could not reset device. will try again in %u seconds",
 			HL_PENDING_RESET_PER_SEC);
 
-		queue_delayed_work(device_reset_work->wq,
-			&device_reset_work->reset_work,
-			msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000));
+		queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work,
+					msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000));
 	}
 }
 
@@ -801,9 +800,8 @@ static int device_early_init(struct hl_device *hdev)
 
 	hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
 
-	hdev->device_reset_work.wq =
-			create_singlethread_workqueue("hl_device_reset");
-	if (!hdev->device_reset_work.wq) {
+	hdev->reset_wq = create_singlethread_workqueue("hl_device_reset");
+	if (!hdev->reset_wq) {
 		rc = -ENOMEM;
 		dev_err(hdev->dev, "Failed to create device reset WQ\n");
 		goto free_cb_mgr;
@@ -879,7 +877,7 @@ static void device_early_fini(struct hl_device *hdev)
 	destroy_workqueue(hdev->ts_free_obj_wq);
 	destroy_workqueue(hdev->cs_cmplt_wq);
 	destroy_workqueue(hdev->eq_wq);
-	destroy_workqueue(hdev->device_reset_work.wq);
+	destroy_workqueue(hdev->reset_wq);
 
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		destroy_workqueue(hdev->cq_wq[i]);
@@ -1460,8 +1458,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		 * Because the reset function can't run from heartbeat work,
 		 * we need to call the reset function from a dedicated work.
 		 */
-		queue_delayed_work(hdev->device_reset_work.wq,
-			&hdev->device_reset_work.reset_work, 0);
+		queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0);
 
 		return 0;
 	}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 6d8ce4a1dbb1..4913197c433e 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2682,17 +2682,15 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 struct hwmon_chip_info;
 
 /**
- * struct hl_device_reset_work - reset workqueue task wrapper.
- * @wq: work queue for device reset procedure.
+ * struct hl_device_reset_work - reset work wrapper.
  * @reset_work: reset work to be done.
  * @hdev: habanalabs device structure.
  * @flags: reset flags.
  */
 struct hl_device_reset_work {
-	struct workqueue_struct		*wq;
-	struct delayed_work		reset_work;
-	struct hl_device		*hdev;
-	u32				flags;
+	struct delayed_work	reset_work;
+	struct hl_device	*hdev;
+	u32			flags;
 };
 
 /**
@@ -3061,6 +3059,7 @@ struct hl_reset_info {
  *               context.
  * @ts_free_obj_wq: work queue for timestamp registration objects release.
  * @prefetch_wq: work queue for MMU pre-fetch operations.
+ * @reset_wq: work queue for device reset procedure.
  * @kernel_ctx: Kernel driver context structure.
  * @kernel_queues: array of hl_hw_queue.
  * @cs_mirror_list: CS mirror list for TDR.
@@ -3232,6 +3231,7 @@ struct hl_device {
 	struct workqueue_struct		*cs_cmplt_wq;
 	struct workqueue_struct		*ts_free_obj_wq;
 	struct workqueue_struct		*prefetch_wq;
+	struct workqueue_struct		*reset_wq;
 	struct hl_ctx			*kernel_ctx;
 	struct hl_hw_queue		*kernel_queues;
 	struct list_head		cs_mirror_list;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 10/13] habanalabs: handle HBM MMU when capturing page fault data
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (7 preceding siblings ...)
  2022-10-06  8:23 ` [PATCH 09/13] habanalabs: move reset workqueue to be under hl_device Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 11/13] habanalabs/gaudi2: capture RAZWI information Oded Gabbay
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

In case of HBM MMU page fault, capture its relevant mappings.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 29 ++++++++++++++++++-------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index e9b373a8cdad..b8b32285720d 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -2273,15 +2273,20 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
 			num_of_engines * sizeof(u16));
 	hdev->captured_err_info.razwi.flags = flags;
 }
-static void hl_capture_user_mappings(struct hl_device *hdev)
+static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
 {
 	struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info;
+	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
 	struct hl_vm_hash_node *hnode;
 	struct hl_userptr *userptr;
+	enum vm_type *vm_type;
 	struct hl_ctx *ctx;
 	u32 map_idx = 0;
 	int i;
 
+	/* Reset previous session count*/
+	pgf_info->num_of_user_mappings = 0;
+
 	ctx = hl_get_compute_ctx(hdev);
 	if (!ctx) {
 		dev_err(hdev->dev, "Can't get user context for user mappings\n");
@@ -2290,7 +2295,7 @@ static void hl_capture_user_mappings(struct hl_device *hdev)
 
 	mutex_lock(&ctx->mem_hash_lock);
 	hash_for_each(ctx->mem_hash, i, hnode, node)
-	pgf_info->num_of_user_mappings++;
+		pgf_info->num_of_user_mappings++;
 
 	if (!pgf_info->num_of_user_mappings)
 		goto finish;
@@ -2300,17 +2305,25 @@ static void hl_capture_user_mappings(struct hl_device *hdev)
 	 */
 	vfree(pgf_info->user_mappings);
 	pgf_info->user_mappings =
-			vmalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping));
+			vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping));
 	if (!pgf_info->user_mappings) {
 		pgf_info->num_of_user_mappings = 0;
 		goto finish;
 	}
 
 	hash_for_each(ctx->mem_hash, i, hnode, node) {
-		userptr = hnode->ptr;
-		pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
-		pgf_info->user_mappings[map_idx].size = userptr->size;
-		map_idx++;
+		vm_type = hnode->ptr;
+		if ((*vm_type == VM_TYPE_USERPTR) && (is_pmmu)) {
+			userptr = hnode->ptr;
+			pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
+			pgf_info->user_mappings[map_idx].size = userptr->size;
+			map_idx++;
+		} else if ((*vm_type == VM_TYPE_PHYS_PACK) && (!is_pmmu)) {
+			phys_pg_pack = hnode->ptr;
+			pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
+			pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size;
+			map_idx++;
+		}
 	}
 finish:
 	mutex_unlock(&ctx->mem_hash_lock);
@@ -2326,5 +2339,5 @@ void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is
 	hdev->captured_err_info.pgf_info.pgf.timestamp = ktime_to_ns(ktime_get());
 	hdev->captured_err_info.pgf_info.pgf.addr = addr;
 	hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id;
-	hl_capture_user_mappings(hdev);
+	hl_capture_user_mappings(hdev, is_pmmu);
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 11/13] habanalabs/gaudi2: capture RAZWI information
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (8 preceding siblings ...)
  2022-10-06  8:23 ` [PATCH 10/13] habanalabs: handle HBM MMU when capturing page fault data Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 12/13] habanalabs/gaudi2: capture page fault data Oded Gabbay
  2022-10-06  8:23 ` [PATCH 13/13] habanalabs: verify no zero event is sent Oded Gabbay
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Added function to calculate possible engines which caused
RAZWI (read-only zero, write ignored), from a given router id or
module index.

When getting RAZWI via PSOC IP, first the router id is calculated
and then the possible engines that caused the RAZWI are calculated.

There is a possibility that the RAZWI initiator is not an engine. In
that case, it will not be included in possible engines as it
doesn't have an engine id.

RAZWI information is captured when receiving event from engine or via
PSOC IP.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 255 ++++++++++++++++++++++--
 include/uapi/misc/habanalabs.h          |   4 +
 2 files changed, 242 insertions(+), 17 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index e05ffaa047a2..13a5356f1ec3 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -128,6 +128,8 @@
 #define GAUDI2_VDEC_MSIX_ENTRIES		(GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM - \
 							GAUDI2_IRQ_NUM_DCORE0_DEC0_NRM + 1)
 
+#define ENGINE_ID_DCORE_OFFSET (GAUDI2_DCORE1_ENGINE_ID_EDMA_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0)
+
 enum hl_pmmu_fatal_cause {
 	LATENCY_RD_OUT_FIFO_OVERRUN,
 	LATENCY_WR_OUT_FIFO_OVERRUN,
@@ -7092,9 +7094,12 @@ static void gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *q
 
 static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 			u64 rtr_mstr_if_base_addr, bool is_write, char *name,
-			bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info)
+			bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info,
+			enum gaudi2_engine_id id)
 {
 	u32 razwi_hi, razwi_lo, razwi_xy;
+	u16 eng_id = id;
+	u8 rd_wr_flag;
 
 	if (is_write) {
 		if (read_razwi_regs) {
@@ -7106,6 +7111,7 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 			razwi_lo = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_lo_reg);
 			razwi_xy = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_id_reg);
 		}
+		rd_wr_flag = HL_RAZWI_WRITE;
 	} else {
 		if (read_razwi_regs) {
 			razwi_hi = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HI);
@@ -7116,8 +7122,12 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 			razwi_lo = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_lo_reg);
 			razwi_xy = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_id_reg);
 		}
+		rd_wr_flag = HL_RAZWI_READ;
 	}
 
+	hl_capture_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &eng_id, 1,
+				rd_wr_flag | HL_RAZWI_HBW);
+
 	dev_err_ratelimited(hdev->dev,
 		"%s-RAZWI SHARED RR HBW %s error, address %#llx, Initiator coordinates 0x%x\n",
 		name, is_write ? "WR" : "RD", (u64)razwi_hi << 32 | razwi_lo, razwi_xy);
@@ -7125,9 +7135,12 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 
 static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev,
 			u64 rtr_mstr_if_base_addr, bool is_write, char *name,
-			bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info)
+			bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info,
+			enum gaudi2_engine_id id)
 {
 	u32 razwi_addr, razwi_xy;
+	u16 eng_id = id;
+	u8 rd_wr_flag;
 
 	if (is_write) {
 		if (read_razwi_regs) {
@@ -7138,9 +7151,7 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev,
 			razwi_xy = le32_to_cpu(razwi_info->lbw.rr_aw_razwi_id_reg);
 		}
 
-		dev_err_ratelimited(hdev->dev,
-			"%s-RAZWI SHARED RR LBW WR error, mstr_if 0x%llx, captured address 0x%x, Initiator coordinates 0x%x\n",
-			name, rtr_mstr_if_base_addr, razwi_addr, razwi_xy);
+		rd_wr_flag = HL_RAZWI_WRITE;
 	} else {
 		if (read_razwi_regs) {
 			razwi_addr = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI);
@@ -7150,9 +7161,57 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev,
 			razwi_xy = le32_to_cpu(razwi_info->lbw.rr_ar_razwi_id_reg);
 		}
 
-		dev_err_ratelimited(hdev->dev,
-			"%s-RAZWI SHARED RR LBW AR error, mstr_if 0x%llx, captured address 0x%x Initiator coordinates 0x%x\n",
-			name, rtr_mstr_if_base_addr, razwi_addr, razwi_xy);
+		rd_wr_flag = HL_RAZWI_READ;
+	}
+
+	hl_capture_razwi(hdev, razwi_addr, &eng_id, 1, rd_wr_flag | HL_RAZWI_LBW);
+	dev_err_ratelimited(hdev->dev,
+				"%s-RAZWI SHARED RR LBW %s error, mstr_if 0x%llx, captured address 0x%x Initiator coordinates 0x%x\n",
+				name, is_write ? "WR" : "RD", rtr_mstr_if_base_addr, razwi_addr,
+						razwi_xy);
+}
+
+static enum gaudi2_engine_id gaudi2_razwi_calc_engine_id(struct hl_device *hdev,
+						enum razwi_event_sources module, u8 module_idx)
+{
+	switch (module) {
+	case RAZWI_TPC:
+		if (module_idx == (NUM_OF_TPC_PER_DCORE * NUM_OF_DCORES))
+			return GAUDI2_DCORE0_ENGINE_ID_TPC_6;
+		return (((module_idx / NUM_OF_TPC_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) +
+				(module_idx % NUM_OF_TPC_PER_DCORE) +
+				(GAUDI2_DCORE0_ENGINE_ID_TPC_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0));
+
+	case RAZWI_MME:
+		return ((GAUDI2_DCORE0_ENGINE_ID_MME - GAUDI2_DCORE0_ENGINE_ID_EDMA_0) +
+			(module_idx * ENGINE_ID_DCORE_OFFSET));
+
+	case RAZWI_EDMA:
+		return (((module_idx / NUM_OF_EDMA_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) +
+			(module_idx % NUM_OF_EDMA_PER_DCORE));
+
+	case RAZWI_PDMA:
+		return (GAUDI2_ENGINE_ID_PDMA_0 + module_idx);
+
+	case RAZWI_NIC:
+		return (GAUDI2_ENGINE_ID_NIC0_0 + (NIC_NUMBER_OF_QM_PER_MACRO * module_idx));
+
+	case RAZWI_DEC:
+		if (module_idx == 8)
+			return GAUDI2_PCIE_ENGINE_ID_DEC_0;
+
+		if (module_idx == 9)
+			return GAUDI2_PCIE_ENGINE_ID_DEC_1;
+					;
+		return (((module_idx / NUM_OF_DEC_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) +
+				(module_idx % NUM_OF_DEC_PER_DCORE) +
+				(GAUDI2_DCORE0_ENGINE_ID_DEC_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0));
+
+	case RAZWI_ROT:
+		return GAUDI2_ENGINE_ID_ROT_0 + module_idx;
+
+	default:
+		return GAUDI2_ENGINE_ID_SIZE;
 	}
 }
 
@@ -7165,7 +7224,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 				u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info)
 {
 	bool via_sft = false, read_razwi_regs = false;
-	u32 rtr_id, dcore_id, dcore_rtr_id, sft_id;
+	u32 rtr_id, dcore_id, dcore_rtr_id, sft_id, eng_id;
 	u64 rtr_mstr_if_base_addr;
 	u32 hbw_shrd_aw = 0, hbw_shrd_ar = 0;
 	u32 lbw_shrd_aw = 0, lbw_shrd_ar = 0;
@@ -7299,9 +7358,11 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 	if (!hbw_shrd_aw && !hbw_shrd_ar && !lbw_shrd_aw && !lbw_shrd_ar)
 		return;
 
+	eng_id = gaudi2_razwi_calc_engine_id(hdev, module, module_idx);
 	if (hbw_shrd_aw) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true,
-						initiator_name, read_razwi_regs, razwi_info);
+						initiator_name, read_razwi_regs, razwi_info,
+						eng_id);
 
 		/* Clear event indication */
 		if (read_razwi_regs)
@@ -7310,7 +7371,8 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 
 	if (hbw_shrd_ar) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false,
-						initiator_name, read_razwi_regs, razwi_info);
+						initiator_name, read_razwi_regs, razwi_info,
+						eng_id);
 
 		/* Clear event indication */
 		if (read_razwi_regs)
@@ -7319,7 +7381,8 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 
 	if (lbw_shrd_aw) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true,
-						initiator_name, read_razwi_regs, razwi_info);
+						initiator_name, read_razwi_regs, razwi_info,
+						eng_id);
 
 		/* Clear event indication */
 		if (read_razwi_regs)
@@ -7328,7 +7391,8 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 
 	if (lbw_shrd_ar) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false,
-						initiator_name, read_razwi_regs, razwi_info);
+						initiator_name, read_razwi_regs, razwi_info,
+						eng_id);
 
 		/* Clear event indication */
 		if (read_razwi_regs)
@@ -7450,25 +7514,175 @@ static const char *gaudi2_get_initiators_name(u32 rtr_id)
 	}
 }
 
+static u16 gaudi2_get_razwi_initiators(u32 rtr_id, u16 *engines)
+{
+	switch (rtr_id) {
+	case DCORE0_RTR0:
+		engines[0] = GAUDI2_DCORE0_ENGINE_ID_DEC_0;
+		engines[1] = GAUDI2_DCORE0_ENGINE_ID_DEC_1;
+		engines[2] = GAUDI2_PCIE_ENGINE_ID_DEC_0;
+		engines[3] = GAUDI2_PCIE_ENGINE_ID_DEC_1;
+		engines[4] = GAUDI2_DCORE0_ENGINE_ID_TPC_6;
+		engines[5] = GAUDI2_ENGINE_ID_PDMA_0;
+		engines[6] = GAUDI2_ENGINE_ID_PDMA_1;
+		engines[7] = GAUDI2_ENGINE_ID_PCIE;
+		engines[8] = GAUDI2_DCORE0_ENGINE_ID_EDMA_0;
+		engines[9] = GAUDI2_DCORE1_ENGINE_ID_EDMA_0;
+		engines[10] = GAUDI2_ENGINE_ID_PSOC;
+		return 11;
+
+	case DCORE0_RTR1:
+		engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_0;
+		engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_1;
+		return 2;
+
+	case DCORE0_RTR2:
+		engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_2;
+		engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_3;
+		return 2;
+
+	case DCORE0_RTR3:
+		engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_4;
+		engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_5;
+		return 2;
+
+	case DCORE0_RTR4:
+	case DCORE0_RTR5:
+	case DCORE0_RTR6:
+	case DCORE0_RTR7:
+		engines[0] = GAUDI2_DCORE0_ENGINE_ID_MME;
+		return 1;
+
+	case DCORE1_RTR0:
+	case DCORE1_RTR1:
+	case DCORE1_RTR2:
+	case DCORE1_RTR3:
+		engines[0] = GAUDI2_DCORE1_ENGINE_ID_MME;
+		return 1;
+
+	case DCORE1_RTR4:
+		engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_4;
+		engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_5;
+		return 2;
+
+	case DCORE1_RTR5:
+		engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_2;
+		engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_3;
+		return 2;
+
+	case DCORE1_RTR6:
+		engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_0;
+		engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_1;
+		return 2;
+
+	case DCORE1_RTR7:
+		engines[0] = GAUDI2_DCORE1_ENGINE_ID_DEC_0;
+		engines[1] = GAUDI2_DCORE1_ENGINE_ID_DEC_1;
+		engines[2] = GAUDI2_ENGINE_ID_NIC0_0;
+		engines[3] = GAUDI2_ENGINE_ID_NIC1_0;
+		engines[4] = GAUDI2_ENGINE_ID_NIC2_0;
+		engines[5] = GAUDI2_ENGINE_ID_NIC3_0;
+		engines[6] = GAUDI2_ENGINE_ID_NIC4_0;
+		engines[7] = GAUDI2_ENGINE_ID_ARC_FARM;
+		engines[8] = GAUDI2_ENGINE_ID_KDMA;
+		engines[9] = GAUDI2_DCORE0_ENGINE_ID_EDMA_1;
+		engines[10] = GAUDI2_DCORE1_ENGINE_ID_EDMA_1;
+		return 11;
+
+	case DCORE2_RTR0:
+		engines[0] = GAUDI2_DCORE2_ENGINE_ID_DEC_0;
+		engines[1] = GAUDI2_DCORE2_ENGINE_ID_DEC_1;
+		engines[2] = GAUDI2_ENGINE_ID_NIC5_0;
+		engines[3] = GAUDI2_ENGINE_ID_NIC6_0;
+		engines[4] = GAUDI2_ENGINE_ID_NIC7_0;
+		engines[5] = GAUDI2_ENGINE_ID_NIC8_0;
+		engines[6] = GAUDI2_DCORE2_ENGINE_ID_EDMA_0;
+		engines[7] = GAUDI2_DCORE3_ENGINE_ID_EDMA_0;
+		engines[8] = GAUDI2_ENGINE_ID_ROT_0;
+		return 9;
+
+	case DCORE2_RTR1:
+		engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_4;
+		engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_5;
+		return 2;
+
+	case DCORE2_RTR2:
+		engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_2;
+		engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_3;
+		return 2;
+
+	case DCORE2_RTR3:
+		engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_0;
+		engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_1;
+		return 2;
+
+	case DCORE2_RTR4:
+	case DCORE2_RTR5:
+	case DCORE2_RTR6:
+	case DCORE2_RTR7:
+		engines[0] = GAUDI2_DCORE2_ENGINE_ID_MME;
+		return 1;
+	case DCORE3_RTR0:
+	case DCORE3_RTR1:
+	case DCORE3_RTR2:
+	case DCORE3_RTR3:
+		engines[0] = GAUDI2_DCORE3_ENGINE_ID_MME;
+		return 1;
+	case DCORE3_RTR4:
+		engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_0;
+		engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_1;
+		return 2;
+	case DCORE3_RTR5:
+		engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_2;
+		engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_3;
+		return 2;
+	case DCORE3_RTR6:
+		engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_4;
+		engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_5;
+		return 2;
+	case DCORE3_RTR7:
+		engines[0] = GAUDI2_DCORE3_ENGINE_ID_DEC_0;
+		engines[1] = GAUDI2_DCORE3_ENGINE_ID_DEC_1;
+		engines[2] = GAUDI2_ENGINE_ID_NIC9_0;
+		engines[3] = GAUDI2_ENGINE_ID_NIC10_0;
+		engines[4] = GAUDI2_ENGINE_ID_NIC11_0;
+		engines[5] = GAUDI2_DCORE2_ENGINE_ID_EDMA_1;
+		engines[6] = GAUDI2_DCORE3_ENGINE_ID_EDMA_1;
+		engines[7] = GAUDI2_ENGINE_ID_ROT_1;
+		engines[8] = GAUDI2_ENGINE_ID_ROT_0;
+		return 9;
+	default:
+		return 0;
+	}
+}
+
 static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u32 rtr_id,
 							u64 rtr_ctrl_base_addr, bool is_write)
 {
+	u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng;
 	u32 razwi_hi, razwi_lo;
+	u8 rd_wr_flag;
+
+	num_of_eng = gaudi2_get_razwi_initiators(rtr_id, &engines[0]);
 
 	if (is_write) {
 		razwi_hi = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_ADDR_HI);
 		razwi_lo = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_ADDR_LO);
+		rd_wr_flag = HL_RAZWI_WRITE;
 
 		/* Clear set indication */
 		WREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_SET, 0x1);
 	} else {
 		razwi_hi = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_ADDR_HI);
 		razwi_lo = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_ADDR_LO);
+		rd_wr_flag = HL_RAZWI_READ;
 
 		/* Clear set indication */
 		WREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_SET, 0x1);
 	}
 
+	hl_capture_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &engines[0], num_of_eng,
+				rd_wr_flag | HL_RAZWI_HBW);
 	dev_err_ratelimited(hdev->dev,
 		"RAZWI PSOC unmapped HBW %s error, rtr id %u, address %#llx\n",
 		is_write ? "WR" : "RD", rtr_id, (u64)razwi_hi << 32 | razwi_lo);
@@ -7480,20 +7694,27 @@ static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u
 static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u32 rtr_id,
 							u64 rtr_ctrl_base_addr, bool is_write)
 {
+	u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng;
 	u32 razwi_addr;
+	u8 rd_wr_flag;
+
+	num_of_eng = gaudi2_get_razwi_initiators(rtr_id, &engines[0]);
 
 	if (is_write) {
 		razwi_addr = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AW_ADDR);
+		rd_wr_flag = HL_RAZWI_WRITE;
 
 		/* Clear set indication */
 		WREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AW_SET, 0x1);
 	} else {
 		razwi_addr = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AR_ADDR);
+		rd_wr_flag = HL_RAZWI_READ;
 
 		/* Clear set indication */
 		WREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AR_SET, 0x1);
 	}
 
+	hl_capture_razwi(hdev, razwi_addr, &engines[0], num_of_eng, rd_wr_flag | HL_RAZWI_LBW);
 	dev_err_ratelimited(hdev->dev,
 		"RAZWI PSOC unmapped LBW %s error, rtr id %u, address %#x\n",
 		is_write ? "WR" : "RD", rtr_id, razwi_addr);
@@ -7974,28 +8195,28 @@ static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev)
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true,
-							NULL);
+							NULL, GAUDI2_ENGINE_ID_PCIE);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true,
-							NULL);
+							NULL, GAUDI2_ENGINE_ID_PCIE);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true,
-							NULL);
+							NULL, GAUDI2_ENGINE_ID_PCIE);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true,
-							NULL);
+							NULL, GAUDI2_ENGINE_ID_PCIE);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 }
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 2b794f54e2ed..a4ceee681898 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -597,6 +597,10 @@ enum gaudi2_engine_id {
 	GAUDI2_ENGINE_ID_NIC10_1,
 	GAUDI2_ENGINE_ID_NIC11_0,
 	GAUDI2_ENGINE_ID_NIC11_1,
+	GAUDI2_ENGINE_ID_PCIE,
+	GAUDI2_ENGINE_ID_PSOC,
+	GAUDI2_ENGINE_ID_ARC_FARM,
+	GAUDI2_ENGINE_ID_KDMA,
 	GAUDI2_ENGINE_ID_SIZE
 };
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 12/13] habanalabs/gaudi2: capture page fault data
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (9 preceding siblings ...)
  2022-10-06  8:23 ` [PATCH 11/13] habanalabs/gaudi2: capture RAZWI information Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  2022-10-06  8:23 ` [PATCH 13/13] habanalabs: verify no zero event is sent Oded Gabbay
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Capture page fault data when it happens.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 13a5356f1ec3..9dc1a328a4bd 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8286,6 +8286,7 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool
 
 	dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n",
 				is_pmmu ? "PMMU" : "HMMU", addr);
+	hl_capture_page_fault(hdev, addr, 0, is_pmmu);
 
 	WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE), 0);
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 13/13] habanalabs: verify no zero event is sent
  2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
                   ` (10 preceding siblings ...)
  2022-10-06  8:23 ` [PATCH 12/13] habanalabs/gaudi2: capture page fault data Oded Gabbay
@ 2022-10-06  8:23 ` Oded Gabbay
  11 siblings, 0 replies; 13+ messages in thread
From: Oded Gabbay @ 2022-10-06  8:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

The event notifier mechanism should not raise an empty
event (event equals zero).

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 5 +++++
 drivers/misc/habanalabs/gaudi/gaudi.c   | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index b8b32285720d..9b54d1df5302 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1746,6 +1746,11 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
 {
 	struct hl_fpriv	*hpriv;
 
+	if (!event_mask) {
+		dev_warn(hdev->dev, "Skip sending zero event");
+		return;
+	}
+
 	mutex_lock(&hdev->fpriv_list_lock);
 
 	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1a99f7be8b60..337123f73501 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7945,7 +7945,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	/* despite reset doesn't execute. a notification on
 	 * occurred event needs to be sent here
 	 */
-	hl_notifier_event_send_all(hdev, event_mask);
+	if (event_mask)
+		hl_notifier_event_send_all(hdev, event_mask);
+
 	if (reset_required)
 		hl_device_reset(hdev, flags);
 	else
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-10-06  8:24 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-06  8:22 [PATCH 01/13] habanalabs: use lower_32_bits() Oded Gabbay
2022-10-06  8:22 ` [PATCH 02/13] habanalabs/gaudi2: fix module ID for RAZWI handling Oded Gabbay
2022-10-06  8:22 ` [PATCH 03/13] habanalabs: add page fault info uapi Oded Gabbay
2022-10-06  8:22 ` [PATCH 04/13] habanalabs: replace 'pf' to 'prefetch' Oded Gabbay
2022-10-06  8:23 ` [PATCH 05/13] habanalabs/gaudi2: remove privileged MME clock configuration Oded Gabbay
2022-10-06  8:23 ` [PATCH 06/13] habanalabs/gaudi2: add device unavailable notification Oded Gabbay
2022-10-06  8:23 ` [PATCH 07/13] habanalabs: skip idle status check if reset on device release Oded Gabbay
2022-10-06  8:23 ` [PATCH 08/13] habanalabs: allow unregistering eventfd when device non-operational Oded Gabbay
2022-10-06  8:23 ` [PATCH 09/13] habanalabs: move reset workqueue to be under hl_device Oded Gabbay
2022-10-06  8:23 ` [PATCH 10/13] habanalabs: handle HBM MMU when capturing page fault data Oded Gabbay
2022-10-06  8:23 ` [PATCH 11/13] habanalabs/gaudi2: capture RAZWI information Oded Gabbay
2022-10-06  8:23 ` [PATCH 12/13] habanalabs/gaudi2: capture page fault data Oded Gabbay
2022-10-06  8:23 ` [PATCH 13/13] habanalabs: verify no zero event is sent Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).