* [PATCH 1/4] habanalabs: Use 'dma_set_mask_and_coherent()'
@ 2021-01-12 19:09 Oded Gabbay
2021-01-12 19:09 ` [PATCH 2/4] habanalabs/gaudi: print sync manager SEI interrupt info Oded Gabbay
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Oded Gabbay @ 2021-01-12 19:09 UTC (permalink / raw)
To: linux-kernel; +Cc: Christophe JAILLET
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Axe 'hl_pci_set_dma_mask()' and replace it with an equivalent
'dma_set_mask_and_coherent()' call.
This makes the code a bit less verbose.
It also removes an erroneous comment, because 'hl_pci_set_dma_mask()'
does not try to use a fall-back value.
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/common/pci/pci.c | 43 ++++--------------------
1 file changed, 7 insertions(+), 36 deletions(-)
diff --git a/drivers/misc/habanalabs/common/pci/pci.c b/drivers/misc/habanalabs/common/pci/pci.c
index c56ec1574127..b799f9258fb0 100644
--- a/drivers/misc/habanalabs/common/pci/pci.c
+++ b/drivers/misc/habanalabs/common/pci/pci.c
@@ -307,40 +307,6 @@ int hl_pci_set_outbound_region(struct hl_device *hdev,
return rc;
}
-/**
- * hl_pci_set_dma_mask() - Set DMA masks for the device.
- * @hdev: Pointer to hl_device structure.
- *
- * This function sets the DMA masks (regular and consistent) for a specified
- * value. If it doesn't succeed, it tries to set it to a fall-back value
- *
- * Return: 0 on success, non-zero for failure.
- */
-static int hl_pci_set_dma_mask(struct hl_device *hdev)
-{
- struct pci_dev *pdev = hdev->pdev;
- int rc;
-
- /* set DMA mask */
- rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(hdev->dma_mask));
- if (rc) {
- dev_err(hdev->dev,
- "Failed to set pci dma mask to %d bits, error %d\n",
- hdev->dma_mask, rc);
- return rc;
- }
-
- rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(hdev->dma_mask));
- if (rc) {
- dev_err(hdev->dev,
- "Failed to set pci consistent dma mask to %d bits, error %d\n",
- hdev->dma_mask, rc);
- return rc;
- }
-
- return 0;
-}
-
/**
* hl_pci_init() - PCI initialization code.
* @hdev: Pointer to hl_device structure.
@@ -377,9 +343,14 @@ int hl_pci_init(struct hl_device *hdev)
goto unmap_pci_bars;
}
- rc = hl_pci_set_dma_mask(hdev);
- if (rc)
+ rc = dma_set_mask_and_coherent(&pdev->dev,
+ DMA_BIT_MASK(hdev->dma_mask));
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to set dma mask to %d bits, error %d\n",
+ hdev->dma_mask, rc);
goto unmap_pci_bars;
+ }
return 0;
--
2.25.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/4] habanalabs/gaudi: print sync manager SEI interrupt info
2021-01-12 19:09 [PATCH 1/4] habanalabs: Use 'dma_set_mask_and_coherent()' Oded Gabbay
@ 2021-01-12 19:09 ` Oded Gabbay
2021-01-12 19:09 ` [PATCH 3/4] habanalabs: ignore F/W BMC errors in case no BMC present Oded Gabbay
2021-01-12 19:09 ` [PATCH 4/4] habanalabs: add security violations dump to debugfs Oded Gabbay
2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2021-01-12 19:09 UTC (permalink / raw)
To: linux-kernel; +Cc: Ofir Bitton
From: Ofir Bitton <obitton@habana.ai>
The driver must print the sync manager SEI information upon receiving
an interrupt from the F/W.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 41 +++++++++++++++++++
.../misc/habanalabs/include/common/cpucp_if.h | 7 ++++
.../include/gaudi/gaudi_async_events.h | 4 ++
3 files changed, 52 insertions(+)
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4b602aa7a6a3..126650e3a9ad 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -225,6 +225,12 @@ gaudi_qman_arb_error_cause[GAUDI_NUM_OF_QM_ARB_ERR_CAUSE] = {
"MSG AXI LBW returned with error"
};
+enum gaudi_sm_sei_cause {
+ GAUDI_SM_SEI_SO_OVERFLOW,
+ GAUDI_SM_SEI_LBW_4B_UNALIGNED,
+ GAUDI_SM_SEI_AXI_RESPONSE_ERR
+};
+
static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_0 */
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_1 */
@@ -6845,6 +6851,34 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
}
}
+static void gaudi_print_sm_sei_info(struct hl_device *hdev, u16 event_type,
+ struct hl_eq_sm_sei_data *sei_data)
+{
+ u32 index = event_type - GAUDI_EVENT_DMA_IF_SEI_0;
+
+ switch (sei_data->sei_cause) {
+ case GAUDI_SM_SEI_SO_OVERFLOW:
+ dev_err(hdev->dev,
+ "SM %u SEI Error: SO %u overflow/underflow",
+ index, le16_to_cpu(sei_data->sei_log));
+ break;
+ case GAUDI_SM_SEI_LBW_4B_UNALIGNED:
+ dev_err(hdev->dev,
+ "SM %u SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x",
+ index, le16_to_cpu(sei_data->sei_log));
+ break;
+ case GAUDI_SM_SEI_AXI_RESPONSE_ERR:
+ dev_err(hdev->dev,
+ "SM %u SEI Error: AXI ID %u response error",
+ index, le16_to_cpu(sei_data->sei_log));
+ break;
+ default:
+ dev_err(hdev->dev, "Unknown SM SEI cause %u",
+ le16_to_cpu(sei_data->sei_log));
+ break;
+ }
+}
+
static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
struct hl_eq_ecc_data *ecc_data)
{
@@ -7468,6 +7502,13 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
hl_fw_unmask_irq(hdev, event_type);
break;
+ case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3:
+ gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_sm_sei_info(hdev, event_type,
+ &eq_entry->sm_sei_data);
+ hl_fw_unmask_irq(hdev, event_type);
+ break;
+
case GAUDI_EVENT_FIX_POWER_ENV_S ... GAUDI_EVENT_FIX_THERMAL_ENV_E:
gaudi_print_clk_change_info(hdev, event_type);
hl_fw_unmask_irq(hdev, event_type);
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 00bd9b392f93..d75d1077461b 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -58,11 +58,18 @@ struct hl_eq_ecc_data {
__u8 pad[7];
};
+struct hl_eq_sm_sei_data {
+ __le16 sei_log;
+ __u8 sei_cause;
+ __u8 pad[5];
+};
+
struct hl_eq_entry {
struct hl_eq_header hdr;
union {
struct hl_eq_ecc_data ecc_data;
struct hl_eq_hbm_ecc_data hbm_ecc_data;
+ struct hl_eq_sm_sei_data sm_sei_data;
__le64 data[7];
};
};
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
index 9ccba8437ec9..49335e8334b4 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
@@ -212,6 +212,10 @@ enum gaudi_async_event_id {
GAUDI_EVENT_NIC_SEI_2 = 266,
GAUDI_EVENT_NIC_SEI_3 = 267,
GAUDI_EVENT_NIC_SEI_4 = 268,
+ GAUDI_EVENT_DMA_IF_SEI_0 = 277,
+ GAUDI_EVENT_DMA_IF_SEI_1 = 278,
+ GAUDI_EVENT_DMA_IF_SEI_2 = 279,
+ GAUDI_EVENT_DMA_IF_SEI_3 = 280,
GAUDI_EVENT_PCIE_FLR = 290,
GAUDI_EVENT_TPC0_BMON_SPMU = 300,
GAUDI_EVENT_TPC0_KRN_ERR = 301,
--
2.25.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 3/4] habanalabs: ignore F/W BMC errors in case no BMC present
2021-01-12 19:09 [PATCH 1/4] habanalabs: Use 'dma_set_mask_and_coherent()' Oded Gabbay
2021-01-12 19:09 ` [PATCH 2/4] habanalabs/gaudi: print sync manager SEI interrupt info Oded Gabbay
@ 2021-01-12 19:09 ` Oded Gabbay
2021-01-12 19:09 ` [PATCH 4/4] habanalabs: add security violations dump to debugfs Oded Gabbay
2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2021-01-12 19:09 UTC (permalink / raw)
To: linux-kernel; +Cc: Ofir Bitton
From: Ofir Bitton <obitton@habana.ai>
In order to support an operation mode in which the BMC is not active,
the driver must not take BMC errors into consideration.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/common/firmware_if.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index a6ea5bbeb699..dcd6c3614044 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -308,9 +308,15 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
dev_warn(hdev->dev,
"Device boot warning - Skipped DRAM initialization\n");
- if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED)
- dev_warn(hdev->dev,
- "Device boot error - Skipped waiting for BMC\n");
+
+ if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
+ if (hdev->bmc_enable)
+ dev_warn(hdev->dev,
+ "Device boot error - Skipped waiting for BMC\n");
+ else
+ err_val &= ~CPU_BOOT_ERR0_BMC_WAIT_SKIPPED;
+ }
+
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
dev_err(hdev->dev,
"Device boot error - Serdes data from BMC not available\n");
--
2.25.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 4/4] habanalabs: add security violations dump to debugfs
2021-01-12 19:09 [PATCH 1/4] habanalabs: Use 'dma_set_mask_and_coherent()' Oded Gabbay
2021-01-12 19:09 ` [PATCH 2/4] habanalabs/gaudi: print sync manager SEI interrupt info Oded Gabbay
2021-01-12 19:09 ` [PATCH 3/4] habanalabs: ignore F/W BMC errors in case no BMC present Oded Gabbay
@ 2021-01-12 19:09 ` Oded Gabbay
2 siblings, 0 replies; 4+ messages in thread
From: Oded Gabbay @ 2021-01-12 19:09 UTC (permalink / raw)
To: linux-kernel; +Cc: Ofir Bitton
From: Ofir Bitton <obitton@habana.ai>
In order to improve driver security debuggability, we add a
security violations dump to debugfs.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
.../ABI/testing/debugfs-driver-habanalabs | 8 +++++++
drivers/misc/habanalabs/common/debugfs.c | 22 +++++++++++++++++++
drivers/misc/habanalabs/common/habanalabs.h | 2 ++
drivers/misc/habanalabs/gaudi/gaudi.c | 3 ++-
drivers/misc/habanalabs/gaudi/gaudiP.h | 1 +
.../misc/habanalabs/gaudi/gaudi_security.c | 5 +++++
drivers/misc/habanalabs/goya/goya.c | 3 ++-
drivers/misc/habanalabs/goya/goyaP.h | 1 +
drivers/misc/habanalabs/goya/goya_security.c | 5 +++++
9 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index c5d678d39144..3979bfdaa080 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -182,3 +182,11 @@ KernelVersion: 5.6
Contact: oded.gabbay@gmail.com
Description: Sets the stop-on_error option for the device engines. Value of
"0" is for disable, otherwise enable.
+
+What: /sys/kernel/debug/habanalabs/hl<n>/dump_security_violations
+Date: Jan 2021
+KernelVersion: 5.12
+Contact: oded.gabbay@gmail.com
+Description: Dumps all security violations to dmesg. This will also ack
+ all security violations, meaning those violations will not be
+ dumped the next time the user calls this API.
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 50ca8eea6648..323d0381a60a 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -867,6 +867,17 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
return count;
}
+static ssize_t hl_security_violations_read(struct file *f, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+ struct hl_device *hdev = entry->hdev;
+
+ hdev->asic_funcs->ack_protection_bits_errors(hdev);
+
+ return 0;
+}
+
static const struct file_operations hl_data32b_fops = {
.owner = THIS_MODULE,
.read = hl_data_read32,
@@ -924,6 +935,11 @@ static const struct file_operations hl_stop_on_err_fops = {
.write = hl_stop_on_err_write
};
+static const struct file_operations hl_security_violations_fops = {
+ .owner = THIS_MODULE,
+ .read = hl_security_violations_read
+};
+
static const struct hl_info_list hl_debugfs_list[] = {
{"command_buffers", command_buffers_show, NULL},
{"command_submission", command_submission_show, NULL},
@@ -1073,6 +1089,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
dev_entry,
&hl_stop_on_err_fops);
+ debugfs_create_file("dump_security_violations",
+ 0644,
+ dev_entry->root,
+ dev_entry,
+ &hl_security_violations_fops);
+
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
ent = debugfs_create_file(hl_debugfs_list[i].name,
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index d3c73fc7baf7..454ef3bfe2e7 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -850,6 +850,7 @@ enum div_select_defs {
* @collective_wait_create_jobs: allocate collective wait cs jobs
* @scramble_vaddr: Routine to scramble the virtual address prior of mapping it
* in the MMU.
+ * @ack_protection_bits_errors: ack and dump all security violations
*/
struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev);
@@ -960,6 +961,7 @@ struct hl_asic_funcs {
struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
u32 collective_engine_id);
u64 (*scramble_vaddr)(struct hl_device *hdev, u64 virt_addr);
+ void (*ack_protection_bits_errors)(struct hl_device *hdev);
};
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 126650e3a9ad..36e2cc22d108 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8545,7 +8545,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
.get_device_time = gaudi_get_device_time,
.collective_wait_init_cs = gaudi_collective_wait_init_cs,
.collective_wait_create_jobs = gaudi_collective_wait_create_jobs,
- .scramble_vaddr = hl_mmu_scramble_vaddr
+ .scramble_vaddr = hl_mmu_scramble_vaddr,
+ .ack_protection_bits_errors = gaudi_ack_protection_bits_errors
};
/**
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 78830443341d..50bb4ad570fd 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -335,6 +335,7 @@ struct gaudi_device {
};
void gaudi_init_security(struct hl_device *hdev);
+void gaudi_ack_protection_bits_errors(struct hl_device *hdev);
void gaudi_add_device_attr(struct hl_device *hdev,
struct attribute_group *dev_attr_grp);
void gaudi_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_security.c b/drivers/misc/habanalabs/gaudi/gaudi_security.c
index e10181692d0b..7085f45814ae 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_security.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_security.c
@@ -13052,3 +13052,8 @@ void gaudi_init_security(struct hl_device *hdev)
gaudi_init_protection_bits(hdev);
}
+
+void gaudi_ack_protection_bits_errors(struct hl_device *hdev)
+{
+
+}
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 86da0401b6ea..b3d530e83e6b 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5456,7 +5456,8 @@ static const struct hl_asic_funcs goya_funcs = {
.get_device_time = goya_get_device_time,
.collective_wait_init_cs = goya_collective_wait_init_cs,
.collective_wait_create_jobs = goya_collective_wait_create_jobs,
- .scramble_vaddr = hl_mmu_scramble_vaddr
+ .scramble_vaddr = hl_mmu_scramble_vaddr,
+ .ack_protection_bits_errors = goya_ack_protection_bits_errors
};
/*
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index 8b3408211af6..23fe099ed218 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -173,6 +173,7 @@ void goya_init_mme_qmans(struct hl_device *hdev);
void goya_init_tpc_qmans(struct hl_device *hdev);
int goya_init_cpu_queues(struct hl_device *hdev);
void goya_init_security(struct hl_device *hdev);
+void goya_ack_protection_bits_errors(struct hl_device *hdev);
int goya_late_init(struct hl_device *hdev);
void goya_late_fini(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/goya/goya_security.c b/drivers/misc/habanalabs/goya/goya_security.c
index 14701836f92b..14c3bae3ccdc 100644
--- a/drivers/misc/habanalabs/goya/goya_security.c
+++ b/drivers/misc/habanalabs/goya/goya_security.c
@@ -3120,3 +3120,8 @@ void goya_init_security(struct hl_device *hdev)
goya_init_protection_bits(hdev);
}
+
+void goya_ack_protection_bits_errors(struct hl_device *hdev)
+{
+
+}
--
2.25.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2021-01-12 19:10 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-12 19:09 [PATCH 1/4] habanalabs: Use 'dma_set_mask_and_coherent()' Oded Gabbay
2021-01-12 19:09 ` [PATCH 2/4] habanalabs/gaudi: print sync manager SEI interrupt info Oded Gabbay
2021-01-12 19:09 ` [PATCH 3/4] habanalabs: ignore F/W BMC errors in case no BMC present Oded Gabbay
2021-01-12 19:09 ` [PATCH 4/4] habanalabs: add security violations dump to debugfs Oded Gabbay
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).