All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info
@ 2022-06-20 13:04 Oded Gabbay
  2022-06-20 13:04 ` [PATCH 02/17] habanalabs: expose undefined opcode status via info ioctl Oded Gabbay
                   ` (15 more replies)
  0 siblings, 16 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

When an undefined opcode error occurs, the driver collects
the relevant information from the QMAN and stores it inside
the hdev data structure. An eventfd notification is then sent to
user space.

Note: a follow-up commit will add support for reading the error
info through an ioctl.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c       |  13 ++-
 drivers/misc/habanalabs/common/habanalabs.h   |  40 ++++++-
 .../misc/habanalabs/common/habanalabs_drv.c   |   1 +
 drivers/misc/habanalabs/gaudi/gaudi.c         | 108 ++++++++++++++----
 include/uapi/misc/habanalabs.h                |   8 +-
 5 files changed, 138 insertions(+), 32 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 38e1ad432e51..0f804ecb6caa 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1531,10 +1531,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	return rc;
 }
 
-static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event)
+static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
 {
 	mutex_lock(&notifier_event->lock);
-	notifier_event->events_mask |= event;
+	notifier_event->events_mask |= event_mask;
+
 	if (notifier_event->eventfd)
 		eventfd_signal(notifier_event->eventfd, 1);
 
@@ -1545,17 +1546,17 @@ static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64
  * hl_notifier_event_send_all - notify all user processes via eventfd
  *
  * @hdev: pointer to habanalabs device structure
- * @event: the occurred event
+ * @event_mask: the occurred event/s
  * Returns 0 for success or an error on failure.
  */
-void hl_notifier_event_send_all(struct hl_device *hdev, u64 event)
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
 {
 	struct hl_fpriv	*hpriv;
 
 	mutex_lock(&hdev->fpriv_list_lock);
 
 	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
-		hl_notifier_event_send(&hpriv->notifier_event, event);
+		hl_notifier_event_send(&hpriv->notifier_event, event_mask);
 
 	mutex_unlock(&hdev->fpriv_list_lock);
 
@@ -1563,7 +1564,7 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event)
 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
 
 	list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node)
-		hl_notifier_event_send(&hpriv->notifier_event, event);
+		hl_notifier_event_send(&hpriv->notifier_event, event_mask);
 
 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 }
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 1ab64e8a05c6..3a0f6dca8361 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2644,14 +2644,48 @@ struct razwi_info {
 	u8		type;
 };
 
+#define MAX_QMAN_STREAMS_INFO		4
+#define OPCODE_INFO_MAX_ADDR_SIZE	8
+/**
+ * struct undefined_opcode_info - info about last undefined opcode error
+ * @timestamp: timestamp of the undefined opcode error
+ * @cb_addr_streams: CB addresses (per stream) that currently exist in the PQ
+ *                   entries. In case all streams array entries are
+ *                   filled with values, it means the execution was in Lower-CP.
+ * @cq_addr: the address of the current handled command buffer
+ * @cq_size: the size of the current handled command buffer
+ * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array.
+ *                       Should be equal to 1 in case of an undefined opcode
+ *                       in Upper-CP (specific stream) and equal to 4 in case
+ *                       of an undefined opcode in Lower-CP.
+ * @engine_id: engine-id that the error occurred on
+ * @stream_id: the stream id the error occurred on. In case the stream equals to
+ *             MAX_QMAN_STREAMS_INFO it means the error occurred on a Lower-CP.
+ * @write_enable: if set, writing to the undefined opcode parameters in the structure
+ *                 is enabled. It is cleared once the first (root cause) undefined
+ *                 opcode is recorded, so that record will not be overwritten.
+ */
+struct undefined_opcode_info {
+	ktime_t timestamp;
+	u64 cb_addr_streams[MAX_QMAN_STREAMS_INFO][OPCODE_INFO_MAX_ADDR_SIZE];
+	u64 cq_addr;
+	u32 cq_size;
+	u32 cb_addr_streams_len;
+	u32 engine_id;
+	u32 stream_id;
+	bool write_enable;
+};
+
 /**
  * struct last_error_session_info - info about last session errors occurred.
  * @cs_timeout: CS timeout error last information.
  * @razwi: razwi last information.
+ * @undef_opcode: undefined opcode information
  */
 struct last_error_session_info {
-	struct	cs_timeout_info	cs_timeout;
-	struct	razwi_info	razwi;
+	struct cs_timeout_info		cs_timeout;
+	struct razwi_info		razwi;
+	struct undefined_opcode_info	undef_opcode;
 };
 
 /**
@@ -3159,7 +3193,7 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization);
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
 		struct cpucp_sensor *sensors_arr);
 
-void hl_notifier_event_send_all(struct hl_device *hdev, u64 event);
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask);
 
 int hl_sysfs_init(struct hl_device *hdev);
 void hl_sysfs_fini(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index e617cc394ff7..d02533666746 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -198,6 +198,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 	atomic_set(&hdev->last_error.cs_timeout.write_enable, 1);
 	atomic_set(&hdev->last_error.razwi.write_enable, 1);
+	hdev->last_error.undef_opcode.write_enable = true;
 
 	hdev->open_counter++;
 	hdev->last_successful_open_jif = jiffies;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 72b0d145e853..ec9f0a93cbe2 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -443,6 +443,38 @@ static s64 gaudi_state_dump_specs_props[] = {
 	[SP_NUM_CORES] = 1,
 };
 
+static const int gaudi_queue_id_to_engine_id[] = {
+	[GAUDI_QUEUE_ID_DMA_0_0...GAUDI_QUEUE_ID_DMA_0_3] = GAUDI_ENGINE_ID_DMA_0,
+	[GAUDI_QUEUE_ID_DMA_1_0...GAUDI_QUEUE_ID_DMA_1_3] = GAUDI_ENGINE_ID_DMA_1,
+	[GAUDI_QUEUE_ID_CPU_PQ] = GAUDI_ENGINE_ID_SIZE,
+	[GAUDI_QUEUE_ID_DMA_2_0...GAUDI_QUEUE_ID_DMA_2_3] = GAUDI_ENGINE_ID_DMA_2,
+	[GAUDI_QUEUE_ID_DMA_3_0...GAUDI_QUEUE_ID_DMA_3_3] = GAUDI_ENGINE_ID_DMA_3,
+	[GAUDI_QUEUE_ID_DMA_4_0...GAUDI_QUEUE_ID_DMA_4_3] = GAUDI_ENGINE_ID_DMA_4,
+	[GAUDI_QUEUE_ID_DMA_5_0...GAUDI_QUEUE_ID_DMA_5_3] = GAUDI_ENGINE_ID_DMA_5,
+	[GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3] = GAUDI_ENGINE_ID_DMA_6,
+	[GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3] = GAUDI_ENGINE_ID_DMA_7,
+	[GAUDI_QUEUE_ID_MME_0_0...GAUDI_QUEUE_ID_MME_0_3] = GAUDI_ENGINE_ID_MME_0,
+	[GAUDI_QUEUE_ID_MME_1_0...GAUDI_QUEUE_ID_MME_1_3] = GAUDI_ENGINE_ID_MME_1,
+	[GAUDI_QUEUE_ID_TPC_0_0...GAUDI_QUEUE_ID_TPC_0_3] = GAUDI_ENGINE_ID_TPC_0,
+	[GAUDI_QUEUE_ID_TPC_1_0...GAUDI_QUEUE_ID_TPC_1_3] = GAUDI_ENGINE_ID_TPC_1,
+	[GAUDI_QUEUE_ID_TPC_2_0...GAUDI_QUEUE_ID_TPC_2_3] = GAUDI_ENGINE_ID_TPC_2,
+	[GAUDI_QUEUE_ID_TPC_3_0...GAUDI_QUEUE_ID_TPC_3_3] = GAUDI_ENGINE_ID_TPC_3,
+	[GAUDI_QUEUE_ID_TPC_4_0...GAUDI_QUEUE_ID_TPC_4_3] = GAUDI_ENGINE_ID_TPC_4,
+	[GAUDI_QUEUE_ID_TPC_5_0...GAUDI_QUEUE_ID_TPC_5_3] = GAUDI_ENGINE_ID_TPC_5,
+	[GAUDI_QUEUE_ID_TPC_6_0...GAUDI_QUEUE_ID_TPC_6_3] = GAUDI_ENGINE_ID_TPC_6,
+	[GAUDI_QUEUE_ID_TPC_7_0...GAUDI_QUEUE_ID_TPC_7_3] = GAUDI_ENGINE_ID_TPC_7,
+	[GAUDI_QUEUE_ID_NIC_0_0...GAUDI_QUEUE_ID_NIC_0_3] = GAUDI_ENGINE_ID_NIC_0,
+	[GAUDI_QUEUE_ID_NIC_1_0...GAUDI_QUEUE_ID_NIC_1_3] = GAUDI_ENGINE_ID_NIC_1,
+	[GAUDI_QUEUE_ID_NIC_2_0...GAUDI_QUEUE_ID_NIC_2_3] = GAUDI_ENGINE_ID_NIC_2,
+	[GAUDI_QUEUE_ID_NIC_3_0...GAUDI_QUEUE_ID_NIC_3_3] = GAUDI_ENGINE_ID_NIC_3,
+	[GAUDI_QUEUE_ID_NIC_4_0...GAUDI_QUEUE_ID_NIC_4_3] = GAUDI_ENGINE_ID_NIC_4,
+	[GAUDI_QUEUE_ID_NIC_5_0...GAUDI_QUEUE_ID_NIC_5_3] = GAUDI_ENGINE_ID_NIC_5,
+	[GAUDI_QUEUE_ID_NIC_6_0...GAUDI_QUEUE_ID_NIC_6_3] = GAUDI_ENGINE_ID_NIC_6,
+	[GAUDI_QUEUE_ID_NIC_7_0...GAUDI_QUEUE_ID_NIC_7_3] = GAUDI_ENGINE_ID_NIC_7,
+	[GAUDI_QUEUE_ID_NIC_8_0...GAUDI_QUEUE_ID_NIC_8_3] = GAUDI_ENGINE_ID_NIC_8,
+	[GAUDI_QUEUE_ID_NIC_9_0...GAUDI_QUEUE_ID_NIC_9_3] = GAUDI_ENGINE_ID_NIC_9,
+};
+
 /* The order here is opposite to the order of the indexing in the h/w.
  * i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc.
  */
@@ -6989,14 +7021,15 @@ static inline u32 gaudi_queue_idx_dec(u32 idx, u32 q_len)
 }
 
 /**
- * gaudi_print_sw_config_stream_data - print SW config stream data
+ * gaudi_handle_sw_config_stream_data - print and record SW config stream data
  *
  * @hdev: pointer to the habanalabs device structure
  * @stream: the QMAN's stream
  * @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
  */
-static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream,
-						u64 qman_base)
+static void gaudi_handle_sw_config_stream_data(struct hl_device *hdev, u32 stream,
+						u64 qman_base, u64 event_mask)
 {
 	u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr;
 	u32 cq_ptr_lo_off, size;
@@ -7014,24 +7047,32 @@ static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream
 	size = RREG32(cq_tsize);
 	dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %u\n",
 							stream, cq_ptr, size);
+
+	if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
+		hdev->last_error.undef_opcode.cq_addr = cq_ptr;
+		hdev->last_error.undef_opcode.cq_size = size;
+		hdev->last_error.undef_opcode.stream_id = stream;
+	}
 }
 
 /**
- * gaudi_print_last_pqes_on_err - print last PQEs on error
+ * gaudi_handle_last_pqes_on_err - print and record last PQEs on error
  *
  * @hdev: pointer to the habanalabs device structure
  * @qid_base: first QID of the QMAN (out of 4 streams)
  * @stream: the QMAN's stream
  * @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
  * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
  */
-static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
+static void gaudi_handle_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 						u32 stream, u64 qman_base,
+						u64 event_mask,
 						bool pr_sw_conf)
 {
 	u32 ci, qm_ci_stream_off, queue_len;
 	struct hl_hw_queue *q;
-	u64 pq_ci;
+	u64 pq_ci, addr[PQ_FETCHER_CACHE_SIZE];
 	int i;
 
 	q = &hdev->kernel_queues[qid_base + stream];
@@ -7046,16 +7087,16 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 	hdev->asic_funcs->hw_queues_lock(hdev);
 
 	if (pr_sw_conf)
-		gaudi_print_sw_config_stream_data(hdev, stream, qman_base);
+		gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask);
 
 	ci = RREG32(pq_ci);
 
 	/* we should start printing form ci -1 */
 	ci = gaudi_queue_idx_dec(ci, queue_len);
+	memset(addr, 0, sizeof(addr));
 
 	for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) {
 		struct hl_bd *bd;
-		u64 addr;
 		u32 len;
 
 		bd = q->kernel_address;
@@ -7066,52 +7107,68 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 		if (!len)
 			break;
 
-		addr = le64_to_cpu(bd->ptr);
+		addr[i] = le64_to_cpu(bd->ptr);
 
 		dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n",
-							stream, ci, addr, len);
+							stream, ci, addr[i], len);
 
 		/* get previous ci, wrap if needed */
 		ci = gaudi_queue_idx_dec(ci, queue_len);
 	}
 
+	if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
+		struct undefined_opcode_info *undef_opcode = &hdev->last_error.undef_opcode;
+		u32 arr_idx = undef_opcode->cb_addr_streams_len;
+
+		if (arr_idx == 0) {
+			undef_opcode->timestamp = ktime_get();
+			undef_opcode->engine_id = gaudi_queue_id_to_engine_id[qid_base];
+		}
+
+		memcpy(undef_opcode->cb_addr_streams[arr_idx], addr, sizeof(addr));
+		undef_opcode->cb_addr_streams_len++;
+	}
+
 	hdev->asic_funcs->hw_queues_unlock(hdev);
 }
 
 /**
- * print_qman_data_on_err - extract QMAN data on error
+ * handle_qman_data_on_err - extract QMAN data on error
  *
  * @hdev: pointer to the habanalabs device structure
  * @qid_base: first QID of the QMAN (out of 4 streams)
  * @stream: the QMAN's stream
  * @qman_base: base address of QMAN registers block
+ * @event_mask: mask of the last events occurred
  *
  * This function attempt to exatract as much data as possible on QMAN error.
  * On upper CP print the SW config stream data and last 8 PQEs.
  * On lower CP print SW config data and last PQEs of ALL 4 upper CPs
  */
-static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base,
-						u32 stream, u64 qman_base)
+static void handle_qman_data_on_err(struct hl_device *hdev, u32 qid_base,
+				   u32 stream, u64 qman_base, u64 event_mask)
 {
 	u32 i;
 
 	if (stream != QMAN_STREAMS) {
-		gaudi_print_last_pqes_on_err(hdev, qid_base, stream, qman_base,
-									true);
+		gaudi_handle_last_pqes_on_err(hdev, qid_base, stream,
+			qman_base, event_mask, true);
 		return;
 	}
 
-	gaudi_print_sw_config_stream_data(hdev, stream, qman_base);
+	/* handle Lower-CP */
+	gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask);
 
 	for (i = 0; i < QMAN_STREAMS; i++)
-		gaudi_print_last_pqes_on_err(hdev, qid_base, i, qman_base,
-									false);
+		gaudi_handle_last_pqes_on_err(hdev, qid_base, i,
+			qman_base, event_mask, false);
 }
 
 static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
 					  const char *qm_name,
 					  u64 qman_base,
-					  u32 qid_base)
+					  u32 qid_base,
+					  u64 *event_mask)
 {
 	u32 i, j, glbl_sts_val, arb_err_val, glbl_sts_clr_val;
 	u64 glbl_sts_addr, arb_err_addr;
@@ -7142,12 +7199,21 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
 				glbl_sts_clr_val |= BIT(j);
 			}
 		}
+		/* check for undefined opcode */
+		if (glbl_sts_val & TPC0_QM_GLBL_STS1_CP_UNDEF_CMD_ERR_MASK &&
+				hdev->last_error.undef_opcode.write_enable) {
+			memset(&hdev->last_error.undef_opcode, 0,
+						sizeof(hdev->last_error.undef_opcode));
+
+			hdev->last_error.undef_opcode.write_enable = false;
+			*event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+		}
 
 		/* Write 1 clear errors */
 		if (!hdev->stop_on_err)
 			WREG32(glbl_sts_addr + 4 * i, glbl_sts_clr_val);
 		else
-			print_qman_data_on_err(hdev, qid_base, i, qman_base);
+			handle_qman_data_on_err(hdev, qid_base, i, qman_base, *event_mask);
 	}
 
 	arb_err_val = RREG32(arb_err_addr);
@@ -7385,7 +7451,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
 		return;
 	}
 
-	gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base);
+	gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base, event_mask);
 }
 
 static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 6d2ccc09dcf2..c94b89cf1ec1 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1402,9 +1402,13 @@ struct hl_debug_args {
 /*
  * Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command
  *
- * HL_NOTIFIER_EVENT_TPC_ASSERT - Indicates TPC assert event
+ * HL_NOTIFIER_EVENT_TPC_ASSERT       - Indicates TPC assert event
+ * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE - Indicates undefined operation code
+ * HL_NOTIFIER_EVENT_DEVICE_RESET     - Indicates device requires a reset
  */
-#define HL_NOTIFIER_EVENT_TPC_ASSERT  (1 << 0)
+#define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
+#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
+#define HL_NOTIFIER_EVENT_DEVICE_RESET		(1ULL << 2)
 
 /*
  * Various information operations such as:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 02/17] habanalabs: expose undefined opcode status via info ioctl
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 03/17] habanalabs/gaudi: invoke device reset from one code block Oded Gabbay
                   ` (14 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

Add an info ioctl opcode that retrieves information about the last
undefined opcode error that occurred.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../misc/habanalabs/common/habanalabs_ioctl.c | 25 ++++++++++++++++
 include/uapi/misc/habanalabs.h                | 30 +++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index c7864d6bb0a1..fe7ed46cd1c5 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -610,6 +610,28 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
 
+static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	struct hl_info_undefined_opcode_event info = {0};
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	info.timestamp = ktime_to_ns(hdev->last_error.undef_opcode.timestamp);
+	info.engine_id = hdev->last_error.undef_opcode.engine_id;
+	info.cq_addr = hdev->last_error.undef_opcode.cq_addr;
+	info.cq_size = hdev->last_error.undef_opcode.cq_size;
+	info.stream_id = hdev->last_error.undef_opcode.stream_id;
+	info.cb_addr_streams_len = hdev->last_error.undef_opcode.cb_addr_streams_len;
+	memcpy(info.cb_addr_streams, hdev->last_error.undef_opcode.cb_addr_streams,
+			sizeof(info.cb_addr_streams));
+
+	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
 static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
 	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
@@ -718,6 +740,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_RAZWI_EVENT:
 		return razwi_info(hpriv, args);
 
+	case HL_INFO_UNDEFINED_OPCODE_EVENT:
+		return undefined_opcode_info(hpriv, args);
+
 	case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES:
 		return dev_mem_alloc_page_sizes_info(hpriv, args);
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index c94b89cf1ec1..5f9a6097f5f3 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -352,6 +352,7 @@ enum hl_server_type {
  * HL_INFO_REGISTER_EVENTFD   - Register eventfd for event notifications.
  * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
  * HL_INFO_GET_EVENTS         - Retrieve the last occurred events
+ * HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information.
  */
 #define HL_INFO_HW_IP_INFO			0
 #define HL_INFO_HW_EVENTS			1
@@ -380,6 +381,7 @@ enum hl_server_type {
 #define HL_INFO_REGISTER_EVENTFD		28
 #define HL_INFO_UNREGISTER_EVENTFD		29
 #define HL_INFO_GET_EVENTS			30
+#define HL_INFO_UNDEFINED_OPCODE_EVENT		31
 
 #define HL_INFO_VERSION_MAX_LEN			128
 #define HL_INFO_CARD_NAME_MAX_LEN		16
@@ -656,6 +658,34 @@ struct hl_info_razwi_event {
 	__u8 pad[2];
 };
 
+#define MAX_QMAN_STREAMS_INFO		4
+#define OPCODE_INFO_MAX_ADDR_SIZE	8
+/**
+ * struct hl_info_undefined_opcode_event - info about last undefined opcode error
+ * @timestamp: timestamp of the undefined opcode error
+ * @cb_addr_streams: CB addresses (per stream) that currently exist in the PQ
+ *                   entries. In case all streams array entries are
+ *                   filled with values, it means the execution was in Lower-CP.
+ * @cq_addr: the address of the current handled command buffer
+ * @cq_size: the size of the current handled command buffer
+ * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array.
+ *                       Should be equal to 1 in case of an undefined opcode
+ *                       in Upper-CP (specific stream) and equal to 4 in case
+ *                       of an undefined opcode in Lower-CP.
+ * @engine_id: engine-id that the error occurred on
+ * @stream_id: the stream id the error occurred on. In case the stream equals to
+ *             MAX_QMAN_STREAMS_INFO it means the error occurred on a Lower-CP.
+ */
+struct hl_info_undefined_opcode_event {
+	__s64 timestamp;
+	__u64 cb_addr_streams[MAX_QMAN_STREAMS_INFO][OPCODE_INFO_MAX_ADDR_SIZE];
+	__u64 cq_addr;
+	__u32 cq_size;
+	__u32 cb_addr_streams_len;
+	__u32 engine_id;
+	__u32 stream_id;
+};
+
 /**
  * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
  * @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 03/17] habanalabs/gaudi: invoke device reset from one code block
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
  2022-06-20 13:04 ` [PATCH 02/17] habanalabs: expose undefined opcode status via info ioctl Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 04/17] habanalabs/gaudi: send device reset notification Oded Gabbay
                   ` (13 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

In order to prepare the driver code for device reset event
notification, change the event handler function flow to call
device reset from one code block.

In addition, this commit fixes an issue where the reset was performed
without checking the 'hard_reset_on_fw_events' state and without
setting the HL_DRV_RESET_DELAY flag.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index ec9f0a93cbe2..8f37297b2c3b 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7795,10 +7795,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	struct gaudi_device *gaudi = hdev->asic_specific;
 	u64 data = le64_to_cpu(eq_entry->data[0]), event_mask = 0;
 	u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
-	u32 fw_fatal_err_flag = 0;
+	u32 fw_fatal_err_flag = 0, flags = 0;
 	u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
 			>> EQ_CTL_EVENT_TYPE_SHIFT);
-	bool reset_required;
+	bool reset_required, reset_direct = false;
 	u8 cause;
 	int rc;
 
@@ -7886,7 +7886,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 			dev_err(hdev->dev, "reset required due to %s\n",
 				gaudi_irq_map_table[event_type].name);
 
-			hl_device_reset(hdev, 0);
+			reset_direct = true;
+			goto reset_device;
 		} else {
 			hl_fw_unmask_irq(hdev, event_type);
 		}
@@ -7908,7 +7909,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 			dev_err(hdev->dev, "reset required due to %s\n",
 				gaudi_irq_map_table[event_type].name);
 
-			hl_device_reset(hdev, 0);
+			reset_direct = true;
+			goto reset_device;
 		} else {
 			hl_fw_unmask_irq(hdev, event_type);
 		}
@@ -8050,12 +8052,17 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	return;
 
 reset_device:
-	if (hdev->asic_prop.fw_security_enabled)
-		hl_device_reset(hdev, HL_DRV_RESET_HARD
-					| HL_DRV_RESET_BYPASS_REQ_TO_FW
-					| fw_fatal_err_flag);
+	reset_required = true;
+
+	if (hdev->asic_prop.fw_security_enabled && !reset_direct)
+		flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW | fw_fatal_err_flag;
 	else if (hdev->hard_reset_on_fw_events)
-		hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag);
+		flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag;
+	else
+		reset_required = false;
+
+	if (reset_required)
+		hl_device_reset(hdev, flags);
 	else
 		hl_fw_unmask_irq(hdev, event_type);
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 04/17] habanalabs/gaudi: send device reset notification
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
  2022-06-20 13:04 ` [PATCH 02/17] habanalabs: expose undefined opcode status via info ioctl Oded Gabbay
  2022-06-20 13:04 ` [PATCH 03/17] habanalabs/gaudi: invoke device reset from one code block Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 05/17] habanalabs: send an event notification when CS timeout occurs Oded Gabbay
                   ` (12 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

A device reset event indicates that the device is about to be reset
after a short delay. In such a case, the driver sends a notification
to the user process. This allows the user process to take debug
actions for system diagnostic purposes.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 8f37297b2c3b..b7460c30aa51 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8054,13 +8054,20 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 reset_device:
 	reset_required = true;
 
-	if (hdev->asic_prop.fw_security_enabled && !reset_direct)
+	if (hdev->asic_prop.fw_security_enabled && !reset_direct) {
 		flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW | fw_fatal_err_flag;
-	else if (hdev->hard_reset_on_fw_events)
+		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
+	} else if (hdev->hard_reset_on_fw_events) {
 		flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag;
-	else
+		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
+	} else {
 		reset_required = false;
+	}
 
+	/* Even if the reset is not executed, a notification about
+	 * the event that occurred needs to be sent here
+	 */
+	hl_notifier_event_send_all(hdev, event_mask);
 	if (reset_required)
 		hl_device_reset(hdev, flags);
 	else
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 05/17] habanalabs: send an event notification when CS timeout occurs
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (2 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 04/17] habanalabs/gaudi: send device reset notification Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 06/17] habanalabs: avoid unnecessary error print Oded Gabbay
                   ` (11 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

The driver needs to inform the user process whenever one of its
CSs times out. The driver detects the CS timeout and sends an
eventfd notification to user space whenever a CS timeout expires.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 26 ++++++++++++-------
 include/uapi/misc/habanalabs.h                |  2 ++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 47b49cbf67ab..cbb7c29966ff 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -797,10 +797,11 @@ static void cs_do_release(struct kref *ref)
 static void cs_timedout(struct work_struct *work)
 {
 	struct hl_device *hdev;
+	u64 event_mask;
 	int rc;
 	struct hl_cs *cs = container_of(work, struct hl_cs,
 						 work_tdr.work);
-	bool skip_reset_on_timeout = cs->skip_reset_on_timeout;
+	bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
 
 	rc = cs_get_unless_zero(cs);
 	if (!rc)
@@ -811,9 +812,15 @@ static void cs_timedout(struct work_struct *work)
 		return;
 	}
 
-	/* Mark the CS is timed out so we won't try to cancel its TDR */
-	if (likely(!skip_reset_on_timeout))
+	if (likely(!skip_reset_on_timeout)) {
+		if (hdev->reset_on_lockup)
+			device_reset = true;
+		else
+			hdev->reset_info.needs_reset = true;
+
+		/* Mark the CS is timed out so we won't try to cancel its TDR */
 		cs->timedout = true;
+	}
 
 	hdev = cs->ctx->hdev;
 
@@ -822,6 +829,11 @@ static void cs_timedout(struct work_struct *work)
 	if (rc) {
 		hdev->last_error.cs_timeout.timestamp = ktime_get();
 		hdev->last_error.cs_timeout.seq = cs->sequence;
+
+		event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT |
+				HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT;
+
+		hl_notifier_event_send_all(hdev, event_mask);
 	}
 
 	switch (cs->type) {
@@ -856,12 +868,8 @@ static void cs_timedout(struct work_struct *work)
 
 	cs_put(cs);
 
-	if (likely(!skip_reset_on_timeout)) {
-		if (hdev->reset_on_lockup)
-			hl_device_reset(hdev, HL_DRV_RESET_TDR);
-		else
-			hdev->reset_info.needs_reset = true;
-	}
+	if (device_reset)
+		hl_device_reset(hdev, HL_DRV_RESET_TDR);
 }
 
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 5f9a6097f5f3..18f86d259421 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1435,10 +1435,12 @@ struct hl_debug_args {
  * HL_NOTIFIER_EVENT_TPC_ASSERT       - Indicates TPC assert event
  * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE - Indicates undefined operation code
  * HL_NOTIFIER_EVENT_DEVICE_RESET     - Indicates device requires a reset
+ * HL_NOTIFIER_EVENT_CS_TIMEOUT       - Indicates CS timeout error
  */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
 #define HL_NOTIFIER_EVENT_DEVICE_RESET		(1ULL << 2)
+#define HL_NOTIFIER_EVENT_CS_TIMEOUT            (1ULL << 3)
 
 /*
  * Various information operations such as:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 06/17] habanalabs: avoid unnecessary error print
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (3 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 05/17] habanalabs: send an event notification when CS timeout occurs Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 07/17] habanalabs/gaudi: fix incorrect MME offset calculation Oded Gabbay
                   ` (10 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

When sending a packet to the FW right after it has performed a reset, we
will get a packet timeout. Since this is expected behavior, we don't need
to print an error in such a case.
Hence, when the driver is in hard reset, it will avoid printing error
messages about packet timeout.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 828a36af5b14..bd66e4f84156 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -267,7 +267,14 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 	hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
 
 	if (rc == -ETIMEDOUT) {
-		dev_err(hdev->dev, "Device CPU packet timeout (0x%x)\n", tmp);
+		/* If FW performed reset just before sending it a packet, we will get a timeout.
+		 * This is expected behavior, hence no need for error message.
+		 */
+		if (!hl_device_operational(hdev, NULL) && !hdev->reset_info.is_in_soft_reset)
+			dev_dbg(hdev->dev, "Device CPU packet timeout (0x%x) due to FW reset\n",
+					tmp);
+		else
+			dev_err(hdev->dev, "Device CPU packet timeout (0x%x)\n", tmp);
 		hdev->device_cpu_disabled = true;
 		goto out;
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 07/17] habanalabs/gaudi: fix incorrect MME offset calculation
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (4 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 06/17] habanalabs: avoid unnecessary error print Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 08/17] habanalabs: add validity check for cq counter offset Oded Gabbay
                   ` (9 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Once FW raised an event following a MME2 QMAN error, the driver should
have gone to the corresponding status registers, trying to gather more
info on the error, yet it was accidentally accessing MME1 QMAN address
space.

Generally, we have 4 MMEs, where 0 & 2 are marked MASTER, and
1 & 3 are marked SLAVE. The former can be addressed, yet addressing
the latter is considered an access violation, and will result in a
hung system, which is what unintentionally happened above.
Note that this cannot happen in a secured system, since these registers
are protected with range registers.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index b7460c30aa51..8b9ff7fa51ea 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -454,7 +454,7 @@ static const int gaudi_queue_id_to_engine_id[] = {
 	[GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3] = GAUDI_ENGINE_ID_DMA_6,
 	[GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3] = GAUDI_ENGINE_ID_DMA_7,
 	[GAUDI_QUEUE_ID_MME_0_0...GAUDI_QUEUE_ID_MME_0_3] = GAUDI_ENGINE_ID_MME_0,
-	[GAUDI_QUEUE_ID_MME_1_0...GAUDI_QUEUE_ID_MME_1_3] = GAUDI_ENGINE_ID_MME_1,
+	[GAUDI_QUEUE_ID_MME_1_0...GAUDI_QUEUE_ID_MME_1_3] = GAUDI_ENGINE_ID_MME_2,
 	[GAUDI_QUEUE_ID_TPC_0_0...GAUDI_QUEUE_ID_TPC_0_3] = GAUDI_ENGINE_ID_TPC_0,
 	[GAUDI_QUEUE_ID_TPC_1_0...GAUDI_QUEUE_ID_TPC_1_3] = GAUDI_ENGINE_ID_TPC_1,
 	[GAUDI_QUEUE_ID_TPC_2_0...GAUDI_QUEUE_ID_TPC_2_3] = GAUDI_ENGINE_ID_TPC_2,
@@ -7383,8 +7383,13 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
 		snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC_QM", index);
 		break;
 	case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM:
-		index = event_type - GAUDI_EVENT_MME0_QM;
-		qid_base = GAUDI_QUEUE_ID_MME_0_0 + index * QMAN_STREAMS;
+		if (event_type == GAUDI_EVENT_MME0_QM) {
+			index = 0;
+			qid_base = GAUDI_QUEUE_ID_MME_0_0;
+		} else if (event_type == GAUDI_EVENT_MME2_QM) {
+			index = 2;
+			qid_base = GAUDI_QUEUE_ID_MME_1_0;
+		}
 		qman_base = mmMME0_QM_BASE + index * MME_QMAN_OFFSET;
 		snprintf(desc, ARRAY_SIZE(desc), "%s%d", "MME_QM", index);
 		break;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 08/17] habanalabs: add validity check for cq counter offset
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (5 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 07/17] habanalabs/gaudi: fix incorrect MME offset calculation Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 09/17] habanalabs/gaudi: fix shift out of bounds Oded Gabbay
                   ` (8 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: farah kassabri

From: farah kassabri <fkassabri@habana.ai>

Driver performs no validity check for the user cq counter offset
used in both wait_for_interrupt and register_for_timestamp APIs.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index cbb7c29966ff..72a557b83a7d 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2979,7 +2979,7 @@ static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
 	u64 current_cq_counter;
 
 	/* Validate ts_offset not exceeding last max */
-	if (requested_offset_record > cb_last) {
+	if (requested_offset_record >= cb_last) {
 		dev_err(buf->mmg->dev, "Ts offset exceeds max CB offset(0x%llx)\n",
 								(u64)(uintptr_t)cb_last);
 		return -EINVAL;
@@ -3064,6 +3064,13 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 		goto put_ctx;
 	}
 
+	/* Validate the cq offset */
+	if (((u64 *) cq_cb->kernel_address + cq_counters_offset) >=
+			((u64 *) cq_cb->kernel_address + (cq_cb->size / sizeof(u64)))) {
+		rc = -EINVAL;
+		goto put_cq_cb;
+	}
+
 	if (register_ts_record) {
 		dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, ts offset: %llu, cq_offset: %llu\n",
 					interrupt->interrupt_id, ts_offset, cq_counters_offset);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 09/17] habanalabs/gaudi: fix shift out of bounds
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (6 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 08/17] habanalabs: add validity check for cq counter offset Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 10/17] habanalabs: fix NULL dereference on cs timeout Oded Gabbay
                   ` (7 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

When validating NIC queues, queue offset calculation must be
performed only for NIC queues.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 8b9ff7fa51ea..31e702846f7a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -5692,15 +5692,17 @@ static int gaudi_parse_cb_no_ext_queue(struct hl_device *hdev,
 {
 	struct asic_fixed_properties *asic_prop = &hdev->asic_prop;
 	struct gaudi_device *gaudi = hdev->asic_specific;
-	u32 nic_mask_q_id = 1 << (HW_CAP_NIC_SHIFT +
-		((parser->hw_queue_id - GAUDI_QUEUE_ID_NIC_0_0) >> 2));
+	u32 nic_queue_offset, nic_mask_q_id;
 
 	if ((parser->hw_queue_id >= GAUDI_QUEUE_ID_NIC_0_0) &&
-			(parser->hw_queue_id <= GAUDI_QUEUE_ID_NIC_9_3) &&
-			(!(gaudi->hw_cap_initialized & nic_mask_q_id))) {
-		dev_err(hdev->dev, "h/w queue %d is disabled\n",
-				parser->hw_queue_id);
-		return -EINVAL;
+			(parser->hw_queue_id <= GAUDI_QUEUE_ID_NIC_9_3)) {
+		nic_queue_offset = parser->hw_queue_id - GAUDI_QUEUE_ID_NIC_0_0;
+		nic_mask_q_id = 1 << (HW_CAP_NIC_SHIFT + (nic_queue_offset >> 2));
+
+		if (!(gaudi->hw_cap_initialized & nic_mask_q_id)) {
+			dev_err(hdev->dev, "h/w queue %d is disabled\n", parser->hw_queue_id);
+			return -EINVAL;
+		}
 	}
 
 	/* For internal queue jobs just check if CB address is valid */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 10/17] habanalabs: fix NULL dereference on cs timeout
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (7 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 09/17] habanalabs/gaudi: fix shift out of bounds Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 11/17] habanalabs: remove unused get_dma_desc_list_size Oded Gabbay
                   ` (6 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Yuri Nudelman

From: Yuri Nudelman <ynudelman@habana.ai>

The device descriptor pointer (hdev) is accessed before it is assigned.
Move the assignment so that it precedes the first use.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 72a557b83a7d..c050f38b7091 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -812,6 +812,8 @@ static void cs_timedout(struct work_struct *work)
 		return;
 	}
 
+	hdev = cs->ctx->hdev;
+
 	if (likely(!skip_reset_on_timeout)) {
 		if (hdev->reset_on_lockup)
 			device_reset = true;
@@ -822,8 +824,6 @@ static void cs_timedout(struct work_struct *work)
 		cs->timedout = true;
 	}
 
-	hdev = cs->ctx->hdev;
-
 	/* Save only the first CS timeout parameters */
 	rc = atomic_cmpxchg(&hdev->last_error.cs_timeout.write_enable, 1, 0);
 	if (rc) {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 11/17] habanalabs: remove unused get_dma_desc_list_size
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (8 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 10/17] habanalabs: fix NULL dereference on cs timeout Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 12/17] habanalabs/gaudi: notify user process on device unavailable Oded Gabbay
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel

This asic callback function is not called anymore from the common code.
The asic-specific function itself is called but from within the
asic-specific code.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 3 ---
 drivers/misc/habanalabs/gaudi/gaudi.c       | 1 -
 drivers/misc/habanalabs/goya/goya.c         | 1 -
 3 files changed, 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 3a0f6dca8361..94893305b928 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1260,7 +1260,6 @@ struct fw_load_mgr {
  * @hl_dma_unmap_sgtable: DMA unmap scatter-gather table.
  * @cs_parser: parse Command Submission.
  * @asic_dma_map_sgtable: DMA map scatter-gather table.
- * @get_dma_desc_list_size: get number of LIN_DMA packets required for CB.
  * @add_end_of_cb_packets: Add packets to the end of CB, if device requires it.
  * @update_eq_ci: update event queue CI.
  * @context_switch: called upon ASID context switch.
@@ -1379,8 +1378,6 @@ struct hl_asic_funcs {
 	int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
 	int (*asic_dma_map_sgtable)(struct hl_device *hdev, struct sg_table *sgt,
 				enum dma_data_direction dir);
-	u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
-					struct sg_table *sgt);
 	void (*add_end_of_cb_packets)(struct hl_device *hdev,
 					void *kernel_address, u32 len,
 					u64 cq_addr, u32 cq_val, u32 msix_num,
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 31e702846f7a..1156ec7dacc1 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -9315,7 +9315,6 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.hl_dma_unmap_sgtable = hl_dma_unmap_sgtable,
 	.cs_parser = gaudi_cs_parser,
 	.asic_dma_map_sgtable = hl_dma_map_sgtable,
-	.get_dma_desc_list_size = gaudi_get_dma_desc_list_size,
 	.add_end_of_cb_packets = gaudi_add_end_of_cb_packets,
 	.update_eq_ci = gaudi_update_eq_ci,
 	.context_switch = gaudi_context_switch,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 4cde505a7416..64590fc55dc9 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5461,7 +5461,6 @@ static const struct hl_asic_funcs goya_funcs = {
 	.hl_dma_unmap_sgtable = hl_dma_unmap_sgtable,
 	.cs_parser = goya_cs_parser,
 	.asic_dma_map_sgtable = hl_dma_map_sgtable,
-	.get_dma_desc_list_size = goya_get_dma_desc_list_size,
 	.add_end_of_cb_packets = goya_add_end_of_cb_packets,
 	.update_eq_ci = goya_update_eq_ci,
 	.context_switch = goya_context_switch,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 12/17] habanalabs/gaudi: notify user process on device unavailable
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (9 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 11/17] habanalabs: remove unused get_dma_desc_list_size Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 13/17] habanalabs: add critical indication in sram ecc Oded Gabbay
                   ` (4 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

When a device error occurs, the user process would like to get some
indication of the error by reading some device HW info. If the
device is unavailable, the user process can't perform any HW device
reading. Hence, notify the user process when the device is unavailable.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c |  5 ++++-
 include/uapi/misc/habanalabs.h        | 12 +++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1156ec7dacc1..939d2636b9ed 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8063,7 +8063,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 
 	if (hdev->asic_prop.fw_security_enabled && !reset_direct) {
 		flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW | fw_fatal_err_flag;
-		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
+
+		/* notify on device unavailable while the reset triggered by fw */
+		event_mask |= (HL_NOTIFIER_EVENT_DEVICE_RESET |
+					HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE);
 	} else if (hdev->hard_reset_on_fw_events) {
 		flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag;
 		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 18f86d259421..78aecea4684d 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1432,15 +1432,17 @@ struct hl_debug_args {
 /*
  * Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command
  *
- * HL_NOTIFIER_EVENT_TPC_ASSERT       - Indicates TPC assert event
- * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE - Indicates undefined operation code
- * HL_NOTIFIER_EVENT_DEVICE_RESET     - Indicates device requires a reset
- * HL_NOTIFIER_EVENT_CS_TIMEOUT       - Indicates CS timeout error
+ * HL_NOTIFIER_EVENT_TPC_ASSERT		- Indicates TPC assert event
+ * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	- Indicates undefined operation code
+ * HL_NOTIFIER_EVENT_DEVICE_RESET	- Indicates device requires a reset
+ * HL_NOTIFIER_EVENT_CS_TIMEOUT		- Indicates CS timeout error
+ * HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE	- Indicates device is unavailable
  */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
 #define HL_NOTIFIER_EVENT_DEVICE_RESET		(1ULL << 2)
-#define HL_NOTIFIER_EVENT_CS_TIMEOUT            (1ULL << 3)
+#define HL_NOTIFIER_EVENT_CS_TIMEOUT		(1ULL << 3)
+#define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE	(1ULL << 4)
 
 /*
  * Various information operations such as:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 13/17] habanalabs: add critical indication in sram ecc
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (10 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 12/17] habanalabs/gaudi: notify user process on device unavailable Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 14/17] habanalabs: check fence pointer before use Oded Gabbay
                   ` (3 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: ran shalit

From: ran shalit <rshalit@habana.ai>

Multiple SRAM SERR events are treated as critical events,
and the host should be notified about them. Thus, add an is_critical
indication as part of the SRAM ECC failure packet.

Signed-off-by: ran shalit <rshalit@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/include/common/cpucp_if.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 38e44b6cf581..b190a44ef2e2 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -68,7 +68,8 @@ struct hl_eq_ecc_data {
 	__le64 ecc_address;
 	__le64 ecc_syndrom;
 	__u8 memory_wrapper_idx;
-	__u8 pad[7];
+	__u8 is_critical;
+	__u8 pad[6];
 };
 
 enum hl_sm_sei_cause {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 14/17] habanalabs: check fence pointer before use
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (11 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 13/17] habanalabs: add critical indication in sram ecc Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 15/17] habanalabs: print pointer with correct modifier Oded Gabbay
                   ` (2 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel

fence pointer can be NULL in this path, as shown by an earlier check.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index c050f38b7091..e5549a9da42e 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2581,7 +2581,7 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_com
 			 * For this we have to validate that the timestamp is
 			 * earliest of all timestamps so far.
 			 */
-			if (mcs_data->update_ts &&
+			if (fence && mcs_data->update_ts &&
 					(ktime_compare(fence->timestamp, first_cs_time) < 0))
 				first_cs_time = fence->timestamp;
 			break;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 15/17] habanalabs: print pointer with correct modifier
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (12 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 14/17] habanalabs: check fence pointer before use Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 16/17] habanalabs: use kvcalloc when possible Oded Gabbay
  2022-06-20 13:04 ` [PATCH 17/17] habanalabs: fix comment style Oded Gabbay
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel

Use %p instead of %llx for printing pointers.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index e5549a9da42e..5130a63e49cf 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -3035,8 +3035,8 @@ static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
 
 	*pend = requested_offset_record;
 
-	dev_dbg(buf->mmg->dev, "Found available node in TS kernel CB(0x%llx)\n",
-						(u64)(uintptr_t)requested_offset_record);
+	dev_dbg(buf->mmg->dev, "Found available node in TS kernel CB %p\n",
+		requested_offset_record);
 	return 0;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 16/17] habanalabs: use kvcalloc when possible
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (13 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 15/17] habanalabs: print pointer with correct modifier Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  2022-06-20 13:04 ` [PATCH 17/17] habanalabs: fix comment style Oded Gabbay
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel

kvcalloc is the same as kvmalloc_array with __GFP_ZERO.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/mmu/mmu_v1.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/mmu/mmu_v1.c b/drivers/misc/habanalabs/common/mmu/mmu_v1.c
index ad4b49281cd7..8a40de4a4761 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu_v1.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu_v1.c
@@ -393,9 +393,8 @@ static int hl_mmu_v1_init(struct hl_device *hdev)
 		goto err_pool_add;
 	}
 
-	hdev->mmu_priv.dr.mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
-						prop->mmu_hop_table_size,
-						GFP_KERNEL | __GFP_ZERO);
+	hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, prop->mmu_hop_table_size,
+										GFP_KERNEL);
 	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
 		rc = -ENOMEM;
 		goto err_pool_add;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 17/17] habanalabs: fix comment style
  2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
                   ` (14 preceding siblings ...)
  2022-06-20 13:04 ` [PATCH 16/17] habanalabs: use kvcalloc when possible Oded Gabbay
@ 2022-06-20 13:04 ` Oded Gabbay
  15 siblings, 0 replies; 17+ messages in thread
From: Oded Gabbay @ 2022-06-20 13:04 UTC (permalink / raw)
  To: linux-kernel

In a kernel-doc comment, the function name should not be preceded by @.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/memory_mgr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/memory_mgr.c b/drivers/misc/habanalabs/common/memory_mgr.c
index ea5f2bd31b0a..56df962d2f3c 100644
--- a/drivers/misc/habanalabs/common/memory_mgr.c
+++ b/drivers/misc/habanalabs/common/memory_mgr.c
@@ -135,7 +135,7 @@ int hl_mmap_mem_buf_put_handle(struct hl_mem_mgr *mmg, u64 handle)
 }
 
 /**
- * @hl_mmap_mem_buf_alloc - allocate a new mappable buffer
+ * hl_mmap_mem_buf_alloc - allocate a new mappable buffer
  *
  * @mmg: parent unifed memory manager
  * @behavior: behavior object describing this buffer polymorphic behavior
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2022-06-20 14:10 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-20 13:04 [PATCH 01/17] habanalabs/gaudi: collect undefined opcode error info Oded Gabbay
2022-06-20 13:04 ` [PATCH 02/17] habanalabs: expose undefined opcode status via info ioctl Oded Gabbay
2022-06-20 13:04 ` [PATCH 03/17] habanalabs/gaudi: invoke device reset from one code block Oded Gabbay
2022-06-20 13:04 ` [PATCH 04/17] habanalabs/gaudi: send device reset notification Oded Gabbay
2022-06-20 13:04 ` [PATCH 05/17] habanalabs: send an event notification when CS timeout occurs Oded Gabbay
2022-06-20 13:04 ` [PATCH 06/17] habanalabs: avoid unnecessary error print Oded Gabbay
2022-06-20 13:04 ` [PATCH 07/17] habanalabs/gaudi: fix incorrect MME offset calculation Oded Gabbay
2022-06-20 13:04 ` [PATCH 08/17] habanalabs: add validity check for cq counter offset Oded Gabbay
2022-06-20 13:04 ` [PATCH 09/17] habanalabs/gaudi: fix shift out of bounds Oded Gabbay
2022-06-20 13:04 ` [PATCH 10/17] habanalabs: fix NULL dereference on cs timeout Oded Gabbay
2022-06-20 13:04 ` [PATCH 11/17] habanalabs: remove unused get_dma_desc_list_size Oded Gabbay
2022-06-20 13:04 ` [PATCH 12/17] habanalabs/gaudi: notify user process on device unavailable Oded Gabbay
2022-06-20 13:04 ` [PATCH 13/17] habanalabs: add critical indication in sram ecc Oded Gabbay
2022-06-20 13:04 ` [PATCH 14/17] habanalabs: check fence pointer before use Oded Gabbay
2022-06-20 13:04 ` [PATCH 15/17] habanalabs: print pointer with correct modifier Oded Gabbay
2022-06-20 13:04 ` [PATCH 16/17] habanalabs: use kvcalloc when possible Oded Gabbay
2022-06-20 13:04 ` [PATCH 17/17] habanalabs: fix comment style Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.