All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/7] habanalabs/gaudi2: print RAZWI info upon PCIe access error
@ 2022-09-18 11:37 Oded Gabbay
  2022-09-18 11:37 ` [PATCH 2/7] habanalabs/gaudi2: increase hard-reset sleep time to 2 sec Oded Gabbay
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Oded Gabbay @ 2022-09-18 11:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Add the dump of the RAZWI information when a PCIe access is blocked by
RR.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c       | 52 +++++++++++++++++--
 drivers/misc/habanalabs/gaudi2/gaudi2_masks.h | 13 +++++
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 5761ca5d50ae..c040e01adafe 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -7963,14 +7963,58 @@ static void gaudi2_handle_dma_core_event(struct hl_device *hdev, u64 intr_cause_
 						gaudi2_dma_core_interrupts_cause[i]);
 }
 
+static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev)
+{
+	u32 mstr_if_base_addr = mmPCIE_MSTR_RR_MSTR_IF_RR_SHRD_HBW_BASE, razwi_happened_addr;
+
+	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED;
+	if (RREG32(razwi_happened_addr)) {
+		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true,
+							NULL);
+		WREG32(razwi_happened_addr, 0x1);
+	}
+
+	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED;
+	if (RREG32(razwi_happened_addr)) {
+		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true,
+							NULL);
+		WREG32(razwi_happened_addr, 0x1);
+	}
+
+	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED;
+	if (RREG32(razwi_happened_addr)) {
+		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true,
+							NULL);
+		WREG32(razwi_happened_addr, 0x1);
+	}
+
+	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED;
+	if (RREG32(razwi_happened_addr)) {
+		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true,
+							NULL);
+		WREG32(razwi_happened_addr, 0x1);
+	}
+}
+
 static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cause_data)
 {
 	int i;
 
-	for (i = 0 ; i < GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE; i++)
-		if (intr_cause_data & BIT_ULL(i))
-			dev_err_ratelimited(hdev->dev, "PCIE ADDR DEC Error: %s\n",
-						gaudi2_pcie_addr_dec_error_cause[i]);
+	for (i = 0 ; i < GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE ; i++) {
+		if (!(intr_cause_data & BIT_ULL(i)))
+			continue;
+
+		dev_err_ratelimited(hdev->dev, "PCIE ADDR DEC Error: %s\n",
+					gaudi2_pcie_addr_dec_error_cause[i]);
+
+		switch (intr_cause_data & BIT_ULL(i)) {
+		case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK:
+			break;
+		case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK:
+			gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev);
+			break;
+		}
+	}
 }
 
 static void gaudi2_handle_pif_fatal(struct hl_device *hdev, u64 intr_cause_data)
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2_masks.h b/drivers/misc/habanalabs/gaudi2/gaudi2_masks.h
index 0239d118abc5..e9ac87828221 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2_masks.h
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2_masks.h
@@ -144,4 +144,17 @@
 #define DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_SIGN_SHIFT	15
 #define DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_SIGN_MASK		0x8000
 
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_ERR_INTR_SHIFT		0
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_ERR_INTR_MASK		0x1
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_SHIFT		1
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK		0x2
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_SHIFT		2
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK		0x4
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_ERR_INTR_MASK_SHIFT		3
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_ERR_INTR_MASK_MASK		0x8
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK_SHIFT	4
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK_MASK	0x10
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK_SHIFT	5
+#define PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK_MASK	0x20
+
 #endif /* GAUDI2_MASKS_H_ */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/7] habanalabs/gaudi2: increase hard-reset sleep time to 2 sec
  2022-09-18 11:37 [PATCH 1/7] habanalabs/gaudi2: print RAZWI info upon PCIe access error Oded Gabbay
@ 2022-09-18 11:37 ` Oded Gabbay
  2022-09-18 11:37 ` [PATCH 3/7] habanalabs/gaudi2: get f/w reset status register dynamically Oded Gabbay
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2022-09-18 11:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

The access to the device registers is blocked during hard reset, until
preboot runs and allows the access to specific registers, including the
PSOC BTM_FSM register which is used to know when the reset is done.
Between the reset request and until this register is polled there is a
small delay of 500 msec which is not enough for F/W to process the reset
and for preboot to run, so the register might be accessed while it is
blocked.
To avoid it, increase the delay to 2 sec.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index c040e01adafe..6ed9b3ce16dd 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -21,7 +21,7 @@
 
 #define GAUDI2_DMA_POOL_BLK_SIZE		SZ_256		/* 256 bytes */
 
-#define GAUDI2_RESET_TIMEOUT_MSEC		500		/* 500ms */
+#define GAUDI2_RESET_TIMEOUT_MSEC		2000		/* 2000ms */
 #define GAUDI2_RESET_POLL_TIMEOUT_USEC		50000		/* 50ms */
 #define GAUDI2_PLDM_HRESET_TIMEOUT_MSEC		25000		/* 25s */
 #define GAUDI2_PLDM_SRESET_TIMEOUT_MSEC		25000		/* 25s */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/7] habanalabs/gaudi2: get f/w reset status register dynamically
  2022-09-18 11:37 [PATCH 1/7] habanalabs/gaudi2: print RAZWI info upon PCIe access error Oded Gabbay
  2022-09-18 11:37 ` [PATCH 2/7] habanalabs/gaudi2: increase hard-reset sleep time to 2 sec Oded Gabbay
@ 2022-09-18 11:37 ` Oded Gabbay
  2022-09-18 11:37 ` [PATCH 4/7] habanalabs: rename error info structure Oded Gabbay
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2022-09-18 11:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: farah kassabri

From: farah kassabri <fkassabri@habana.ai>

Get the firmware reset status address from the dynamic registers
we read from the firmware instead of using a define.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c             | 5 ++++-
 drivers/misc/habanalabs/include/common/hl_boot_if.h | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 6ed9b3ce16dd..b95eab4c237c 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -5439,7 +5439,10 @@ static void gaudi2_execute_soft_reset(struct hl_device *hdev, u32 reset_sleep_ms
 
 	if (!driver_performs_reset) {
 		/* set SP to indicate reset request sent to FW */
-		WREG32(mmCPU_RST_STATUS_TO_HOST, CPU_RST_STATUS_NA);
+		if (dyn_regs->cpu_rst_status)
+			WREG32(le32_to_cpu(dyn_regs->cpu_rst_status), CPU_RST_STATUS_NA);
+		else
+			WREG32(mmCPU_RST_STATUS_TO_HOST, CPU_RST_STATUS_NA);
 
 		WREG32(le32_to_cpu(dyn_regs->gic_host_soft_rst_irq),
 			gaudi2_irq_map_table[GAUDI2_EVENT_CPU_SOFT_RESET].cpu_id);
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 2e45be5de4fe..e0ea51cc7475 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -431,7 +431,9 @@ struct cpu_dyn_regs {
 	__le32 gic_host_ints_irq;
 	__le32 gic_host_soft_rst_irq;
 	__le32 gic_rot_qm_irq_ctrl;
-	__le32 reserved1[22];		/* reserve for future use */
+	__le32 cpu_rst_status;
+	__le32 eng_arc_irq_ctrl;
+	__le32 reserved1[20];		/* reserve for future use */
 };
 
 /* TODO: remove the desc magic after the code is updated to use message */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/7] habanalabs: rename error info structure
  2022-09-18 11:37 [PATCH 1/7] habanalabs/gaudi2: print RAZWI info upon PCIe access error Oded Gabbay
  2022-09-18 11:37 ` [PATCH 2/7] habanalabs/gaudi2: increase hard-reset sleep time to 2 sec Oded Gabbay
  2022-09-18 11:37 ` [PATCH 3/7] habanalabs/gaudi2: get f/w reset status register dynamically Oded Gabbay
@ 2022-09-18 11:37 ` Oded Gabbay
  2022-09-18 11:37 ` [PATCH 5/7] habanalabs/gaudi: change TPC Assert to use TPC DEC instead of QMAN err Oded Gabbay
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2022-09-18 11:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

As a preparation for adding more error types to this structure,
rename it to a more suitable name.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    |  6 ++--
 drivers/misc/habanalabs/common/habanalabs.h   | 12 +++----
 .../misc/habanalabs/common/habanalabs_drv.c   |  6 ++--
 .../misc/habanalabs/common/habanalabs_ioctl.c | 30 +++++++++---------
 drivers/misc/habanalabs/gaudi/gaudi.c         | 31 ++++++++++---------
 5 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 746b688d34cf..fbe5003191bf 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -826,10 +826,10 @@ static void cs_timedout(struct work_struct *work)
 	}
 
 	/* Save only the first CS timeout parameters */
-	rc = atomic_cmpxchg(&hdev->last_error.cs_timeout.write_enable, 1, 0);
+	rc = atomic_cmpxchg(&hdev->captured_err_info.cs_timeout.write_enable, 1, 0);
 	if (rc) {
-		hdev->last_error.cs_timeout.timestamp = ktime_get();
-		hdev->last_error.cs_timeout.seq = cs->sequence;
+		hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
+		hdev->captured_err_info.cs_timeout.seq = cs->sequence;
 
 		event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT |
 				HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index f2910ac7aa22..259eebdc2f1b 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2981,12 +2981,12 @@ struct undefined_opcode_info {
 };
 
 /**
- * struct last_error_session_info - info about last session errors occurred.
- * @cs_timeout: CS timeout error last information.
- * @razwi: razwi last information.
+ * struct hl_error_info - holds information collected during an error.
+ * @cs_timeout: CS timeout error information.
+ * @razwi: razwi information.
  * @undef_opcode: undefined opcode information
  */
-struct last_error_session_info {
+struct hl_error_info {
 	struct cs_timeout_info		cs_timeout;
 	struct razwi_info		razwi;
 	struct undefined_opcode_info	undef_opcode;
@@ -3111,7 +3111,7 @@ struct hl_reset_info {
  * @state_dump_specs: constants and dictionaries needed to dump system state.
  * @multi_cs_completion: array of multi-CS completion.
  * @clk_throttling: holds information about current/previous clock throttling events
- * @last_error: holds information about last session in which CS timeout or razwi error occurred.
+ * @captured_err_info: holds information about errors.
  * @reset_info: holds current device reset information.
  * @stream_master_qid_arr: pointer to array with QIDs of master streams.
  * @fw_major_version: major version of current loaded preboot.
@@ -3286,7 +3286,7 @@ struct hl_device {
 	struct multi_cs_completion	multi_cs_completion[
 							MULTI_CS_MAX_USER_CTX];
 	struct hl_clk_throttle		clk_throttling;
-	struct last_error_session_info	last_error;
+	struct hl_error_info		captured_err_info;
 
 	struct hl_reset_info		reset_info;
 
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index c60d6dab7aa7..73ae6f64d3ba 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -211,9 +211,9 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 	hl_debugfs_add_file(hpriv);
 
-	atomic_set(&hdev->last_error.cs_timeout.write_enable, 1);
-	atomic_set(&hdev->last_error.razwi.write_enable, 1);
-	hdev->last_error.undef_opcode.write_enable = true;
+	atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
+	atomic_set(&hdev->captured_err_info.razwi.write_enable, 1);
+	hdev->captured_err_info.undef_opcode.write_enable = true;
 
 	hdev->open_counter++;
 	hdev->last_successful_open_jif = jiffies;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index c7bd000750c8..ab0be082f3a6 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -593,8 +593,8 @@ static int cs_timeout_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	info.seq = hdev->last_error.cs_timeout.seq;
-	info.timestamp = ktime_to_ns(hdev->last_error.cs_timeout.timestamp);
+	info.seq = hdev->captured_err_info.cs_timeout.seq;
+	info.timestamp = ktime_to_ns(hdev->captured_err_info.cs_timeout.timestamp);
 
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
@@ -609,12 +609,12 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	info.timestamp = ktime_to_ns(hdev->last_error.razwi.timestamp);
-	info.addr = hdev->last_error.razwi.addr;
-	info.engine_id_1 = hdev->last_error.razwi.engine_id_1;
-	info.engine_id_2 = hdev->last_error.razwi.engine_id_2;
-	info.no_engine_id = hdev->last_error.razwi.non_engine_initiator;
-	info.error_type = hdev->last_error.razwi.type;
+	info.timestamp = ktime_to_ns(hdev->captured_err_info.razwi.timestamp);
+	info.addr = hdev->captured_err_info.razwi.addr;
+	info.engine_id_1 = hdev->captured_err_info.razwi.engine_id_1;
+	info.engine_id_2 = hdev->captured_err_info.razwi.engine_id_2;
+	info.no_engine_id = hdev->captured_err_info.razwi.non_engine_initiator;
+	info.error_type = hdev->captured_err_info.razwi.type;
 
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
@@ -629,13 +629,13 @@ static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *ar
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	info.timestamp = ktime_to_ns(hdev->last_error.undef_opcode.timestamp);
-	info.engine_id = hdev->last_error.undef_opcode.engine_id;
-	info.cq_addr = hdev->last_error.undef_opcode.cq_addr;
-	info.cq_size = hdev->last_error.undef_opcode.cq_size;
-	info.stream_id = hdev->last_error.undef_opcode.stream_id;
-	info.cb_addr_streams_len = hdev->last_error.undef_opcode.cb_addr_streams_len;
-	memcpy(info.cb_addr_streams, hdev->last_error.undef_opcode.cb_addr_streams,
+	info.timestamp = ktime_to_ns(hdev->captured_err_info.undef_opcode.timestamp);
+	info.engine_id = hdev->captured_err_info.undef_opcode.engine_id;
+	info.cq_addr = hdev->captured_err_info.undef_opcode.cq_addr;
+	info.cq_size = hdev->captured_err_info.undef_opcode.cq_size;
+	info.stream_id = hdev->captured_err_info.undef_opcode.stream_id;
+	info.cb_addr_streams_len = hdev->captured_err_info.undef_opcode.cb_addr_streams_len;
+	memcpy(info.cb_addr_streams, hdev->captured_err_info.undef_opcode.cb_addr_streams,
 			sizeof(info.cb_addr_streams));
 
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 48ff3b103b9f..f81a141b4741 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6894,9 +6894,9 @@ static void gaudi_handle_sw_config_stream_data(struct hl_device *hdev, u32 strea
 							stream, cq_ptr, size);
 
 	if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
-		hdev->last_error.undef_opcode.cq_addr = cq_ptr;
-		hdev->last_error.undef_opcode.cq_size = size;
-		hdev->last_error.undef_opcode.stream_id = stream;
+		hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
+		hdev->captured_err_info.undef_opcode.cq_size = size;
+		hdev->captured_err_info.undef_opcode.stream_id = stream;
 	}
 }
 
@@ -6962,7 +6962,7 @@ static void gaudi_handle_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 	}
 
 	if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
-		struct undefined_opcode_info *undef_opcode = &hdev->last_error.undef_opcode;
+		struct undefined_opcode_info *undef_opcode = &hdev->captured_err_info.undef_opcode;
 		u32 arr_idx = undef_opcode->cb_addr_streams_len;
 
 		if (arr_idx == 0) {
@@ -7046,11 +7046,11 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
 		}
 		/* check for undefined opcode */
 		if (glbl_sts_val & TPC0_QM_GLBL_STS1_CP_UNDEF_CMD_ERR_MASK &&
-				hdev->last_error.undef_opcode.write_enable) {
-			memset(&hdev->last_error.undef_opcode, 0,
-						sizeof(hdev->last_error.undef_opcode));
+				hdev->captured_err_info.undef_opcode.write_enable) {
+			memset(&hdev->captured_err_info.undef_opcode, 0,
+						sizeof(hdev->captured_err_info.undef_opcode));
 
-			hdev->last_error.undef_opcode.write_enable = false;
+			hdev->captured_err_info.undef_opcode.write_enable = false;
 			*event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
 		}
 
@@ -7332,18 +7332,19 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
 		gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type);
 
 		/* In case it's the first razwi, save its parameters*/
-		rc = atomic_cmpxchg(&hdev->last_error.razwi.write_enable, 1, 0);
+		rc = atomic_cmpxchg(&hdev->captured_err_info.razwi.write_enable, 1, 0);
 		if (rc) {
-			hdev->last_error.razwi.timestamp = ktime_get();
-			hdev->last_error.razwi.addr = razwi_addr;
-			hdev->last_error.razwi.engine_id_1 = engine_id_1;
-			hdev->last_error.razwi.engine_id_2 = engine_id_2;
+			hdev->captured_err_info.razwi.timestamp = ktime_get();
+			hdev->captured_err_info.razwi.addr = razwi_addr;
+			hdev->captured_err_info.razwi.engine_id_1 = engine_id_1;
+			hdev->captured_err_info.razwi.engine_id_2 = engine_id_2;
 			/*
 			 * If first engine id holds non valid value the razwi initiator
 			 * does not have engine id
 			 */
-			hdev->last_error.razwi.non_engine_initiator = (engine_id_1 == U16_MAX);
-			hdev->last_error.razwi.type = razwi_type;
+			hdev->captured_err_info.razwi.non_engine_initiator =
+									(engine_id_1 == U16_MAX);
+			hdev->captured_err_info.razwi.type = razwi_type;
 
 		}
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 5/7] habanalabs/gaudi: change TPC Assert to use TPC DEC instead of QMAN err
  2022-09-18 11:37 [PATCH 1/7] habanalabs/gaudi2: print RAZWI info upon PCIe access error Oded Gabbay
                   ` (2 preceding siblings ...)
  2022-09-18 11:37 ` [PATCH 4/7] habanalabs: rename error info structure Oded Gabbay
@ 2022-09-18 11:37 ` Oded Gabbay
  2022-09-18 11:37 ` [PATCH 6/7] habanalabs/gaudi2: add handling to pmmu events in eqe handler Oded Gabbay
  2022-09-18 11:37 ` [PATCH 7/7] habanalabs/gaudi2: add secured attestation info uapi Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2022-09-18 11:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: Tal Cohen

From: Tal Cohen <talcohen@habana.ai>

This change is needed because there is a problem with using the QMAN
error for the asynchronous TPC assert notification: a security
limitation prevents generating the assert via a QMAN error.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index f81a141b4741..e80ebace49c8 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7216,12 +7216,6 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
 
 	switch (event_type) {
 	case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
-		/* In TPC QM event, notify on TPC assertion. While there isn't
-		 * a specific event for assertion yet, the FW generates QM event.
-		 * The SW upper layer will inspect an internal mapped area to indicate
-		 * if the event is a tpc assertion or tpc QM.
-		 */
-		*event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT;
 		index = event_type - GAUDI_EVENT_TPC0_QM;
 		qid_base = GAUDI_QUEUE_ID_TPC_0_0 + index * QMAN_STREAMS;
 		qman_base = mmTPC0_QM_BASE + index * TPC_QMAN_OFFSET;
@@ -7731,6 +7725,12 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_TPC5_DEC:
 	case GAUDI_EVENT_TPC6_DEC:
 	case GAUDI_EVENT_TPC7_DEC:
+		/* In TPC DEC event, notify on TPC assertion. While there isn't
+		 * a specific event for assertion yet, the FW generates TPC DEC event.
+		 * The SW upper layer will inspect an internal mapped area to indicate
+		 * if the event is a TPC Assertion or a "real" TPC DEC.
+		 */
+		event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT;
 		gaudi_print_irq_info(hdev, event_type, true);
 		reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_dec_event_to_tpc_id(event_type),
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 6/7] habanalabs/gaudi2: add handling to pmmu events in eqe handler
  2022-09-18 11:37 [PATCH 1/7] habanalabs/gaudi2: print RAZWI info upon PCIe access error Oded Gabbay
                   ` (3 preceding siblings ...)
  2022-09-18 11:37 ` [PATCH 5/7] habanalabs/gaudi: change TPC Assert to use TPC DEC instead of QMAN err Oded Gabbay
@ 2022-09-18 11:37 ` Oded Gabbay
  2022-09-18 11:37 ` [PATCH 7/7] habanalabs/gaudi2: add secured attestation info uapi Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2022-09-18 11:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

In order to get the error cause and the captured address in case of a
page fault, add the PMMU events to the EQE handler.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index b95eab4c237c..b8b8b2dc2095 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8756,6 +8756,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 
 	case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
 	case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
+	case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR:
 	case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0:
 		gaudi2_handle_mmu_spi_sei_err(hdev, event_type);
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 7/7] habanalabs/gaudi2: add secured attestation info uapi
  2022-09-18 11:37 [PATCH 1/7] habanalabs/gaudi2: print RAZWI info upon PCIe access error Oded Gabbay
                   ` (4 preceding siblings ...)
  2022-09-18 11:37 ` [PATCH 6/7] habanalabs/gaudi2: add handling to pmmu events in eqe handler Oded Gabbay
@ 2022-09-18 11:37 ` Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2022-09-18 11:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

The user provides a nonce via the ioctl and retrieves the
secured attestation data of the boot, generated using the given
nonce.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c  | 46 +++++++++++
 drivers/misc/habanalabs/common/habanalabs.h   |  3 +
 .../misc/habanalabs/common/habanalabs_ioctl.c | 52 +++++++++++++
 .../misc/habanalabs/include/common/cpucp_if.h | 77 ++++++++++++++++++-
 include/uapi/misc/habanalabs.h                | 43 +++++++++++
 5 files changed, 219 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index c2375917fc02..26a7529083e1 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -2988,3 +2988,49 @@ void hl_fw_set_max_power(struct hl_device *hdev)
 	if (rc)
 		dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);
 }
+
+/* Send CPU-CP packet @packet_id with @nonce and copy @size bytes of the DMA buffer to @data. */
+static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void *data, u32 size,
+					u32 nonce, u32 timeout)
+{
+	struct cpucp_packet pkt = {};
+	dma_addr_t req_dma_addr;
+	void *req_cpu_addr;
+	int rc;
+
+	/* The allocation result is what must be validated here, not the caller's @data. */
+	req_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, size, &req_dma_addr);
+	if (!req_cpu_addr) {
+		dev_err(hdev->dev,
+			"Failed to allocate DMA memory for CPU-CP packet %u\n", packet_id);
+		return -ENOMEM;
+	}
+
+	memset(data, 0, size);
+
+	pkt.ctl = cpu_to_le32(packet_id << CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.addr = cpu_to_le64(req_dma_addr);
+	pkt.data_max_size = cpu_to_le32(size);
+	pkt.nonce = cpu_to_le32(nonce);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), timeout, NULL);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc);
+		goto out;
+	}
+
+	memcpy(data, req_cpu_addr, size);
+
+out:
+	hl_cpu_accessible_dma_pool_free(hdev, size, req_cpu_addr);
+
+	return rc;
+}
+
+int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info,
+				u32 nonce)
+{
+	return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_SEC_ATTEST_GET, sec_attest_info,
+					sizeof(struct cpucp_sec_attest_info), nonce,
+					HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
+}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 259eebdc2f1b..2ffb8378f565 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -66,6 +66,7 @@ struct hl_fpriv;
 #define HL_CPUCP_INFO_TIMEOUT_USEC	10000000 /* 10s */
 #define HL_CPUCP_EEPROM_TIMEOUT_USEC	10000000 /* 10s */
 #define HL_CPUCP_MON_DUMP_TIMEOUT_USEC	10000000 /* 10s */
+#define HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC 10000000 /* 10s */
 
 #define HL_FW_STATUS_POLL_INTERVAL_USEC		10000 /* 10ms */
 #define HL_FW_COMMS_STATUS_PLDM_POLL_INTERVAL_USEC	1000000 /* 1s */
@@ -3750,6 +3751,8 @@ int hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr, long *va
 void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr, long value);
 long hl_fw_get_max_power(struct hl_device *hdev);
 void hl_fw_set_max_power(struct hl_device *hdev);
+int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info,
+				u32 nonce);
 int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value);
 int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value);
 int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value);
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index ab0be082f3a6..43afe40966e5 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -662,6 +662,55 @@ static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
 
+/* HL_INFO handler: fetch attestation data from F/W for the user's nonce and copy it to user. */
+static int sec_attest_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	struct cpucp_sec_attest_info *sec_attest_info;
+	struct hl_info_sec_attest *info;
+	u32 max_size = args->return_size;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	sec_attest_info = kmalloc(sizeof(*sec_attest_info), GFP_KERNEL);
+	if (!sec_attest_info)
+		return -ENOMEM;
+
+	/* Zeroed: this buffer goes to user space, so bytes not assigned below must not leak. */
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		rc = -ENOMEM;
+		goto free_sec_attest_info;
+	}
+
+	rc = hl_fw_get_sec_attest_info(hpriv->hdev, sec_attest_info, args->sec_attest_nonce);
+	if (rc)
+		goto free_info;
+
+	info->nonce = le32_to_cpu(sec_attest_info->nonce);
+	info->pcr_quote_len = le16_to_cpu(sec_attest_info->pcr_quote_len);
+	info->pub_data_len = le16_to_cpu(sec_attest_info->pub_data_len);
+	info->certificate_len = le16_to_cpu(sec_attest_info->certificate_len);
+	info->pcr_num_reg = sec_attest_info->pcr_num_reg;
+	info->pcr_reg_len = sec_attest_info->pcr_reg_len;
+	info->quote_sig_len = sec_attest_info->quote_sig_len;
+	memcpy(&info->pcr_data, &sec_attest_info->pcr_data, sizeof(info->pcr_data));
+	memcpy(&info->pcr_quote, &sec_attest_info->pcr_quote, sizeof(info->pcr_quote));
+	memcpy(&info->public_data, &sec_attest_info->public_data, sizeof(info->public_data));
+	memcpy(&info->certificate, &sec_attest_info->certificate, sizeof(info->certificate));
+	memcpy(&info->quote_sig, &sec_attest_info->quote_sig, sizeof(info->quote_sig));
+
+	rc = copy_to_user(out, info, min_t(size_t, max_size, sizeof(*info))) ? -EFAULT : 0;
+
+free_info:
+	kfree(info);
+free_sec_attest_info:
+	kfree(sec_attest_info);
+	return rc;
+}
+
 static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
 	int rc;
@@ -844,6 +893,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_DRAM_PENDING_ROWS:
 		return dram_pending_rows_info(hpriv, args);
 
+	case HL_INFO_SECURED_ATTESTATION:
+		return sec_attest_info(hpriv, args);
+
 	case HL_INFO_REGISTER_EVENTFD:
 		return eventfd_register(hpriv, args);
 
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 9593d1a26945..baa5aa43b6f4 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -629,6 +629,12 @@ enum pq_init_status {
  * CPUCP_PACKET_ENGINE_CORE_ASID_SET -
  *       Packet to perform engine core ASID configuration
  *
+ * CPUCP_PACKET_SEC_ATTEST_GET -
+ *       Get the attestation data that is collected during various stages of the
+ *       boot sequence. The attestation data is also hashed with a unique
+ *       number (nonce) provided by the host to prevent replay attacks.
+ *       The public key and certificate are also provided in the FW response.
+ *
  * CPUCP_PACKET_MONITOR_DUMP_GET -
  *       Get monitors registers dump from the CpuCP kernel.
  *       The CPU will put the registers dump in the a buffer allocated by the driver
@@ -691,15 +697,15 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_RESERVED,			/* not used */
 	CPUCP_PACKET_ENGINE_CORE_ASID_SET,	/* internal */
 	CPUCP_PACKET_RESERVED2,			/* not used */
+	CPUCP_PACKET_SEC_ATTEST_GET,		/* internal */
 	CPUCP_PACKET_RESERVED3,			/* not used */
 	CPUCP_PACKET_RESERVED4,			/* not used */
-	CPUCP_PACKET_RESERVED5,			/* not used */
 	CPUCP_PACKET_MONITOR_DUMP_GET,		/* debugfs */
+	CPUCP_PACKET_RESERVED5,			/* not used */
 	CPUCP_PACKET_RESERVED6,			/* not used */
 	CPUCP_PACKET_RESERVED7,			/* not used */
 	CPUCP_PACKET_RESERVED8,			/* not used */
 	CPUCP_PACKET_RESERVED9,			/* not used */
-	CPUCP_PACKET_RESERVED10,		/* not used */
 	CPUCP_PACKET_ACTIVE_STATUS_SET,		/* internal */
 	CPUCP_PACKET_ID_MAX			/* must be last */
 };
@@ -794,6 +800,9 @@ struct cpucp_packet {
 		 * result cannot be used to hold general purpose data.
 		 */
 		__le32 status_mask;
+
+		/* random, used once number, for security packets */
+		__le32 nonce;
 	};
 
 	/* For NIC requests */
@@ -1219,6 +1228,70 @@ enum cpu_reset_status {
 	CPU_RST_STATUS_SOFT_RST_DONE = 1,
 };
 
+#define SEC_PCR_DATA_BUF_SZ	256	/* size of the raw PCR values buffer */
+#define SEC_PCR_QUOTE_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
+#define SEC_SIGNATURE_BUF_SZ	255	/* (256 - 1) 1 byte used for size */
+#define SEC_PUB_DATA_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
+#define SEC_CERTIFICATE_BUF_SZ	2046	/* (2048 - 2) 2 bytes used for size */
+
+/*
+ * struct cpucp_sec_attest_info - attestation report of the boot
+ * @pcr_data: raw values of the PCR registers
+ * @pcr_num_reg: number of PCR registers in the pcr_data array
+ * @pcr_reg_len: length of each PCR register in the pcr_data array (bytes)
+ * @nonce: number used only once. Random number provided by the host; it is
+ *	    also passed to the quote command as qualifying data.
+ * @pcr_quote_len: length of the attestation quote data (bytes)
+ * @pcr_quote: attestation report data structure
+ * @quote_sig_len: length of the attestation report signature (bytes)
+ * @quote_sig: signature structure of the attestation report
+ * @pub_data_len: length of the public data (bytes)
+ * @public_data: public key for the signed attestation
+ *		 (outPublic + name + qualifiedName)
+ * @certificate_len: length of the certificate (bytes)
+ * @certificate: certificate for the attestation signing key
+ */
+struct cpucp_sec_attest_info {
+	__u8 pcr_data[SEC_PCR_DATA_BUF_SZ];
+	__u8 pcr_num_reg;
+	__u8 pcr_reg_len;
+	__le16 pad0;	/* explicit padding; puts nonce at a 4-byte aligned offset */
+	__le32 nonce;
+	__le16 pcr_quote_len;	/* length field + buffer below span 512 bytes */
+	__u8 pcr_quote[SEC_PCR_QUOTE_BUF_SZ];
+	__u8 quote_sig_len;	/* length field + buffer below span 256 bytes */
+	__u8 quote_sig[SEC_SIGNATURE_BUF_SZ];
+	__le16 pub_data_len;	/* length field + buffer below span 512 bytes */
+	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
+	__le16 certificate_len;	/* length field + buffer below span 2048 bytes */
+	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
+};
+
+/*
+ * struct cpucp_dev_info_signed - device information signed by a secured device
+ * @info: device information structure as defined above
+ * @nonce: number used only once. Random number provided by the host; it is
+ *	   hashed and signed along with the device information.
+ * @info_sig_len: length of the attestation signature (bytes)
+ * @info_sig: signature of the info + nonce data.
+ * @pub_data_len: length of the public data (bytes)
+ * @public_data: public key for the signed info data
+ *		 (outPublic + name + qualifiedName)
+ * @certificate_len: length of the certificate (bytes)
+ * @certificate: certificate for the signing key
+ */
+struct cpucp_dev_info_signed {
+	struct cpucp_info info;	/* assumed to be 64bit aligned */
+	__le32 nonce;
+	__le32 pad0;		/* explicit padding; nonce + pad0 span 8 bytes */
+	__u8 info_sig_len;	/* length field + buffer below span 256 bytes */
+	__u8 info_sig[SEC_SIGNATURE_BUF_SZ];
+	__le16 pub_data_len;	/* length field + buffer below span 512 bytes */
+	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
+	__le16 certificate_len;	/* length field + buffer below span 2048 bytes */
+	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
+};
+
 /*
  * struct dcore_monitor_regs_data - DCORE monitor regs data.
  * the structure follows sync manager block layout. relevant only to Gaudi.
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index a4bab0fd8223..e00ebe05097d 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -773,6 +773,7 @@ enum hl_server_type {
  *                            Razwi initiator.
  *                            Razwi cause, was it a page fault or MMU access error.
  * HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES - Retrieve valid page sizes for device memory allocation
+ * HL_INFO_SECURED_ATTESTATION - Retrieve attestation report of the boot.
  * HL_INFO_REGISTER_EVENTFD   - Register eventfd for event notifications.
  * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
  * HL_INFO_GET_EVENTS         - Retrieve the last occurred events
@@ -802,6 +803,7 @@ enum hl_server_type {
 #define HL_INFO_CS_TIMEOUT_EVENT		24
 #define HL_INFO_RAZWI_EVENT			25
 #define HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES	26
+#define HL_INFO_SECURED_ATTESTATION		27
 #define HL_INFO_REGISTER_EVENTFD		28
 #define HL_INFO_UNREGISTER_EVENTFD		29
 #define HL_INFO_GET_EVENTS			30
@@ -1133,6 +1135,45 @@ struct hl_info_dev_memalloc_page_sizes {
 	__u64 page_order_bitmask;
 };
 
+#define SEC_PCR_DATA_BUF_SZ	256	/* size of the raw PCR values buffer */
+#define SEC_PCR_QUOTE_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
+#define SEC_SIGNATURE_BUF_SZ	255	/* (256 - 1) 1 byte used for size */
+#define SEC_PUB_DATA_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
+#define SEC_CERTIFICATE_BUF_SZ	2046	/* (2048 - 2) 2 bytes used for size */
+
+/*
+ * struct hl_info_sec_attest - attestation report of the boot
+ * @nonce: number used only once. Random number provided by the host; it is also passed to the
+ *         quote command as qualifying data.
+ * @pcr_quote_len: length of the attestation quote data (bytes)
+ * @pub_data_len: length of the public data (bytes)
+ * @certificate_len: length of the certificate (bytes)
+ * @pcr_num_reg: number of PCR registers in the pcr_data array
+ * @pcr_reg_len: length of each PCR register in the pcr_data array (bytes)
+ * @quote_sig_len: length of the attestation report signature (bytes)
+ * @pcr_data: raw values of the PCR registers
+ * @pcr_quote: attestation report data structure
+ * @quote_sig: signature structure of the attestation report
+ * @public_data: public key for the signed attestation
+ *		 (outPublic + name + qualifiedName)
+ * @certificate: certificate for the attestation signing key
+ */
+struct hl_info_sec_attest {
+	__u32 nonce;
+	__u16 pcr_quote_len;
+	__u16 pub_data_len;
+	__u16 certificate_len;
+	__u8 pcr_num_reg;
+	__u8 pcr_reg_len;
+	__u8 quote_sig_len;	/* fixed-size scalars end here, 13 bytes in total */
+	__u8 pcr_data[SEC_PCR_DATA_BUF_SZ];
+	__u8 pcr_quote[SEC_PCR_QUOTE_BUF_SZ];
+	__u8 quote_sig[SEC_SIGNATURE_BUF_SZ];
+	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
+	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
+	__u8 pad0[2];	/* explicit padding; brings the struct to 3592 bytes (multiple of 8) */
+};
+
 enum gaudi_dcores {
 	HL_GAUDI_WS_DCORE,
 	HL_GAUDI_WN_DCORE,
@@ -1158,6 +1199,7 @@ enum gaudi_dcores {
  *                           driver. It is possible for the user to allocate buffer larger than
  *                           needed, hence updating this variable so user will know the exact amount
  *                           of bytes copied by the kernel to the buffer.
+ * @sec_attest_nonce: Nonce number used for attestation report.
  * @pad: Padding to 64 bit.
  */
 struct hl_info_args {
@@ -1172,6 +1214,7 @@ struct hl_info_args {
 		__u32 pll_index;
 		__u32 eventfd;
 		__u32 user_buffer_actual_size;
+		__u32 sec_attest_nonce;
 	};
 
 	__u32 pad;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-09-18 11:38 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-18 11:37 [PATCH 1/7] habanalabs/gaudi2: print RAZWI info upon PCIe access error Oded Gabbay
2022-09-18 11:37 ` [PATCH 2/7] habanalabs/gaudi2: increase hard-reset sleep time to 2 sec Oded Gabbay
2022-09-18 11:37 ` [PATCH 3/7] habanalabs/gaudi2: get f/w reset status register dynamically Oded Gabbay
2022-09-18 11:37 ` [PATCH 4/7] habanalabs: rename error info structure Oded Gabbay
2022-09-18 11:37 ` [PATCH 5/7] habanalabs/gaudi: change TPC Assert to use TPC DEC instead of QMAN err Oded Gabbay
2022-09-18 11:37 ` [PATCH 6/7] habanalabs/gaudi2: add handling to pmmu events in eqe handler Oded Gabbay
2022-09-18 11:37 ` [PATCH 7/7] habanalabs/gaudi2: add secured attestation info uapi Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.