linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/5] habanalabs: add debugfs write64/read64
@ 2020-02-07  8:15 Oded Gabbay
  2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh, Moti Haimovski

From: Moti Haimovski <mhaimovski@habana.ai>

Allow debug user to write/read 64-bit data through debugfs.
This will expedite the dump process of the (large) internal
memories of the device done during debug.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 .../ABI/testing/debugfs-driver-habanalabs     | 14 +++
 drivers/misc/habanalabs/debugfs.c             | 71 ++++++++++++++
 drivers/misc/habanalabs/goya/goya.c           | 92 +++++++++++++++++++
 drivers/misc/habanalabs/habanalabs.h          |  2 +
 4 files changed, 179 insertions(+)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index f0ac14b70ecb..a73601c5121e 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -43,6 +43,20 @@ Description:    Allows the root user to read or write directly through the
                 If the IOMMU is disabled, it also allows the root user to read
                 or write from the host a device VA of a host mapped memory
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/data64
+Date:           Jan 2020
+KernelVersion:  5.6
+Contact:        oded.gabbay@gmail.com
+Description:    Allows the root user to read or write 64 bit data directly
+                through the device's PCI bar. Writing to this file generates a
+                write transaction while reading from the file generates a read
+                transaction. This custom interface is needed (instead of using
+                the generic Linux user-space PCI mapping) because the DDR bar
+                is very small compared to the DDR memory and only the driver can
+                move the bar before and after the transaction.
+                If the IOMMU is disabled, it also allows the root user to read
+                or write from the host a device VA of a host mapped memory
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/device
 Date:           Jan 2019
 KernelVersion:  5.1
diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index 599d17dfd542..756d36ed5d95 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -710,6 +710,65 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 	return count;
 }
 
+static ssize_t hl_data_read64(struct file *f, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+	struct hl_device *hdev = entry->hdev;
+	char tmp_buf[32];
+	u64 addr = entry->addr;
+	u64 val;
+	ssize_t rc;
+
+	if (*ppos)
+		return 0;
+
+	if (hl_is_device_va(hdev, addr)) {
+		rc = device_va_to_pa(hdev, addr, &addr);
+		if (rc)
+			return rc;
+	}
+
+	rc = hdev->asic_funcs->debugfs_read64(hdev, addr, &val);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to read from 0x%010llx\n", addr);
+		return rc;
+	}
+
+	sprintf(tmp_buf, "0x%016llx\n", val);
+	return simple_read_from_buffer(buf, count, ppos, tmp_buf,
+			strlen(tmp_buf));
+}
+
+static ssize_t hl_data_write64(struct file *f, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+	struct hl_device *hdev = entry->hdev;
+	u64 addr = entry->addr;
+	u64 value;
+	ssize_t rc;
+
+	rc = kstrtoull_from_user(buf, count, 16, &value);
+	if (rc)
+		return rc;
+
+	if (hl_is_device_va(hdev, addr)) {
+		rc = device_va_to_pa(hdev, addr, &addr);
+		if (rc)
+			return rc;
+	}
+
+	rc = hdev->asic_funcs->debugfs_write64(hdev, addr, value);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to write 0x%016llx to 0x%010llx\n",
+			value, addr);
+		return rc;
+	}
+
+	return count;
+}
+
 static ssize_t hl_get_power_state(struct file *f, char __user *buf,
 		size_t count, loff_t *ppos)
 {
@@ -917,6 +976,12 @@ static const struct file_operations hl_data32b_fops = {
 	.write = hl_data_write32
 };
 
+static const struct file_operations hl_data64b_fops = {
+	.owner = THIS_MODULE,
+	.read = hl_data_read64,
+	.write = hl_data_write64
+};
+
 static const struct file_operations hl_i2c_data_fops = {
 	.owner = THIS_MODULE,
 	.read = hl_i2c_data_read,
@@ -1030,6 +1095,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 				dev_entry,
 				&hl_data32b_fops);
 
+	debugfs_create_file("data64",
+				0644,
+				dev_entry->root,
+				dev_entry,
+				&hl_data64b_fops);
+
 	debugfs_create_file("set_power_state",
 				0200,
 				dev_entry->root,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index f634e9c5cad9..0b6567b48622 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4180,6 +4180,96 @@ static int goya_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
 	return rc;
 }
 
+static int goya_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 ddr_bar_addr;
+	int rc = 0;
+
+	if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
+		u32 val_l = RREG32(addr - CFG_BASE);
+		u32 val_h = RREG32(addr + sizeof(u32) - CFG_BASE);
+
+		*val = (((u64) val_h) << 32) | val_l;
+
+	} else if ((addr >= SRAM_BASE_ADDR) &&
+			(addr <= SRAM_BASE_ADDR + SRAM_SIZE - sizeof(u64))) {
+
+		*val = readq(hdev->pcie_bar[SRAM_CFG_BAR_ID] +
+				(addr - SRAM_BASE_ADDR));
+
+	} else if ((addr >= DRAM_PHYS_BASE) &&
+		   (addr <=
+		    DRAM_PHYS_BASE + hdev->asic_prop.dram_size - sizeof(u64))) {
+
+		u64 bar_base_addr = DRAM_PHYS_BASE +
+				(addr & ~(prop->dram_pci_bar_size - 0x1ull));
+
+		ddr_bar_addr = goya_set_ddr_bar_base(hdev, bar_base_addr);
+		if (ddr_bar_addr != U64_MAX) {
+			*val = readq(hdev->pcie_bar[DDR_BAR_ID] +
+						(addr - bar_base_addr));
+
+			ddr_bar_addr = goya_set_ddr_bar_base(hdev,
+							ddr_bar_addr);
+		}
+		if (ddr_bar_addr == U64_MAX)
+			rc = -EIO;
+
+	} else if (addr >= HOST_PHYS_BASE && !iommu_present(&pci_bus_type)) {
+		*val = *(u64 *) phys_to_virt(addr - HOST_PHYS_BASE);
+
+	} else {
+		rc = -EFAULT;
+	}
+
+	return rc;
+}
+
+static int goya_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 ddr_bar_addr;
+	int rc = 0;
+
+	if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
+		WREG32(addr - CFG_BASE, lower_32_bits(val));
+		WREG32(addr + sizeof(u32) - CFG_BASE, upper_32_bits(val));
+
+	} else if ((addr >= SRAM_BASE_ADDR) &&
+			(addr <= SRAM_BASE_ADDR + SRAM_SIZE - sizeof(u64))) {
+
+		writeq(val, hdev->pcie_bar[SRAM_CFG_BAR_ID] +
+					(addr - SRAM_BASE_ADDR));
+
+	} else if ((addr >= DRAM_PHYS_BASE) &&
+		   (addr <=
+		    DRAM_PHYS_BASE + hdev->asic_prop.dram_size - sizeof(u64))) {
+
+		u64 bar_base_addr = DRAM_PHYS_BASE +
+				(addr & ~(prop->dram_pci_bar_size - 0x1ull));
+
+		ddr_bar_addr = goya_set_ddr_bar_base(hdev, bar_base_addr);
+		if (ddr_bar_addr != U64_MAX) {
+			writeq(val, hdev->pcie_bar[DDR_BAR_ID] +
+						(addr - bar_base_addr));
+
+			ddr_bar_addr = goya_set_ddr_bar_base(hdev,
+							ddr_bar_addr);
+		}
+		if (ddr_bar_addr == U64_MAX)
+			rc = -EIO;
+
+	} else if (addr >= HOST_PHYS_BASE && !iommu_present(&pci_bus_type)) {
+		*(u64 *) phys_to_virt(addr - HOST_PHYS_BASE) = val;
+
+	} else {
+		rc = -EFAULT;
+	}
+
+	return rc;
+}
+
 static u64 goya_read_pte(struct hl_device *hdev, u64 addr)
 {
 	struct goya_device *goya = hdev->asic_specific;
@@ -5186,6 +5276,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.restore_phase_topology = goya_restore_phase_topology,
 	.debugfs_read32 = goya_debugfs_read32,
 	.debugfs_write32 = goya_debugfs_write32,
+	.debugfs_read64 = goya_debugfs_read64,
+	.debugfs_write64 = goya_debugfs_write64,
 	.add_device_attr = goya_add_device_attr,
 	.handle_eqe = goya_handle_eqe,
 	.set_pll_profile = goya_set_pll_profile,
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 954906292c00..4ef8cf23d099 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -582,6 +582,8 @@ struct hl_asic_funcs {
 	void (*restore_phase_topology)(struct hl_device *hdev);
 	int (*debugfs_read32)(struct hl_device *hdev, u64 addr, u32 *val);
 	int (*debugfs_write32)(struct hl_device *hdev, u64 addr, u32 val);
+	int (*debugfs_read64)(struct hl_device *hdev, u64 addr, u64 *val);
+	int (*debugfs_write64)(struct hl_device *hdev, u64 addr, u64 val);
 	void (*add_device_attr)(struct hl_device *hdev,
 				struct attribute_group *dev_attr_grp);
 	void (*handle_eqe)(struct hl_device *hdev,
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/5] habanalabs: ratelimit error prints of IRQs
  2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
@ 2020-02-07  8:15 ` Oded Gabbay
  2020-02-09  6:33   ` Omer Shpigelman
  2020-02-07  8:15 ` [PATCH 3/5] habanalabs: support temperature offset via sysfs Oded Gabbay
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh

The compute engines can perform millions of transactions per second. If
there is a bug in the S/W stack, we could get a lot of interrupts and spam
the kernel log. Therefore, ratelimit these prints

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 0b6567b48622..19bce06e5fc0 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4480,22 +4480,22 @@ static void goya_get_event_desc(u16 event_type, char *desc, size_t size)
 static void goya_print_razwi_info(struct hl_device *hdev)
 {
 	if (RREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD)) {
-		dev_err(hdev->dev, "Illegal write to LBW\n");
+		dev_err_ratelimited(hdev->dev, "Illegal write to LBW\n");
 		WREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD, 0);
 	}
 
 	if (RREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD)) {
-		dev_err(hdev->dev, "Illegal read from LBW\n");
+		dev_err_ratelimited(hdev->dev, "Illegal read from LBW\n");
 		WREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD, 0);
 	}
 
 	if (RREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD)) {
-		dev_err(hdev->dev, "Illegal write to HBW\n");
+		dev_err_ratelimited(hdev->dev, "Illegal write to HBW\n");
 		WREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD, 0);
 	}
 
 	if (RREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD)) {
-		dev_err(hdev->dev, "Illegal read from HBW\n");
+		dev_err_ratelimited(hdev->dev, "Illegal read from HBW\n");
 		WREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD, 0);
 	}
 }
@@ -4515,7 +4515,8 @@ static void goya_print_mmu_error_info(struct hl_device *hdev)
 		addr <<= 32;
 		addr |= RREG32(mmMMU_PAGE_ERROR_CAPTURE_VA);
 
-		dev_err(hdev->dev, "MMU page fault on va 0x%llx\n", addr);
+		dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n",
+					addr);
 
 		WREG32(mmMMU_PAGE_ERROR_CAPTURE, 0);
 	}
@@ -4527,7 +4528,7 @@ static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
 	char desc[20] = "";
 
 	goya_get_event_desc(event_type, desc, sizeof(desc));
-	dev_err(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
+	dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
 		event_type, desc);
 
 	if (razwi) {
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/5] habanalabs: support temperature offset via sysfs
  2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
  2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
@ 2020-02-07  8:15 ` Oded Gabbay
  2020-02-07  8:15 ` [PATCH 4/5] habanalabs: modify the return values of hl_read/write routines Oded Gabbay
  2020-02-07  8:15 ` [PATCH 5/5] habanalabs: provide historical maximum of various sensors Oded Gabbay
  3 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh, Moti Haimovski

From: Moti Haimovski <mhaimovski@habana.ai>

This commit adds support for offsetting the temperatures reading
by a specified value as defined in
https://www.kernel.org/doc/Documentation/hwmon/sysfs-interface
using the standard sysfs defined for hwmon.
This is required by system administrators to inject errors to test
their monitoring applications in data centers.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/habanalabs.h       |  2 ++
 drivers/misc/habanalabs/hwmon.c            | 37 ++++++++++++++++++++++
 drivers/misc/habanalabs/include/armcp_if.h | 13 +++++++-
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 4ef8cf23d099..4de12d3ff836 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1610,6 +1610,8 @@ int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask);
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
 long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
+int hl_set_temperature(struct hl_device *hdev,
+			int sensor_index, u32 attr, long value);
 long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr);
 long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr);
 long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr);
diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/hwmon.c
index 7be4bace9b4f..70088fdb0a5b 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/hwmon.c
@@ -125,6 +125,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		case hwmon_temp_crit:
 		case hwmon_temp_max_hyst:
 		case hwmon_temp_crit_hyst:
+		case hwmon_temp_offset:
 			break;
 		default:
 			return -EINVAL;
@@ -192,6 +193,15 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 		return -ENODEV;
 
 	switch (type) {
+	case hwmon_temp:
+		switch (attr) {
+		case hwmon_temp_offset:
+			break;
+		default:
+			return -EINVAL;
+		}
+		hl_set_temperature(hdev, channel, attr, val);
+		break;
 	case hwmon_pwm:
 		switch (attr) {
 		case hwmon_pwm_input:
@@ -220,6 +230,8 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type,
 		case hwmon_temp_crit:
 		case hwmon_temp_crit_hyst:
 			return 0444;
+		case hwmon_temp_offset:
+			return 0644;
 		}
 		break;
 	case hwmon_in:
@@ -291,6 +303,31 @@ long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr)
 	return result;
 }
 
+int hl_set_temperature(struct hl_device *hdev,
+			int sensor_index, u32 attr, long value)
+{
+	struct armcp_packet pkt;
+	int rc;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_SET <<
+				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.sensor_index = __cpu_to_le16(sensor_index);
+	pkt.type = __cpu_to_le16(attr);
+	pkt.value = __cpu_to_le64(value);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+						SENSORS_PKT_TIMEOUT, NULL);
+
+	if (rc)
+		dev_err(hdev->dev,
+			"Failed to set temperature of sensor %d, error %d\n",
+			sensor_index, rc);
+
+	return rc;
+}
+
 long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr)
 {
 	struct armcp_packet pkt;
diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h
index e4c6699a1868..014549eaf919 100644
--- a/drivers/misc/habanalabs/include/armcp_if.h
+++ b/drivers/misc/habanalabs/include/armcp_if.h
@@ -189,6 +189,10 @@ enum pq_init_status {
  *       ArmCP to write to the structure, to prevent data corruption in case of
  *       mismatched driver/FW versions.
  *
+ * ARMCP_PACKET_TEMPERATURE_SET -
+ *       Set the value of the offset property of a specified thermal sensor.
+ *       The packet's arguments specify the desired sensor and the field to
+ *       set.
  */
 
 enum armcp_packet_id {
@@ -214,6 +218,8 @@ enum armcp_packet_id {
 	ARMCP_PACKET_MAX_POWER_GET,		/* sysfs */
 	ARMCP_PACKET_MAX_POWER_SET,		/* sysfs */
 	ARMCP_PACKET_EEPROM_DATA_GET,		/* sysfs */
+	ARMCP_RESERVED,
+	ARMCP_PACKET_TEMPERATURE_SET,		/* sysfs */
 };
 
 #define ARMCP_PACKET_FENCE_VAL	0xFE8CE7A5
@@ -271,12 +277,17 @@ enum armcp_packet_rc {
 	armcp_packet_fault
 };
 
+/*
+ * armcp_temp_type should adhere to hwmon_temp_attributes
+ * defined in Linux kernel hwmon.h file
+ */
 enum armcp_temp_type {
 	armcp_temp_input,
 	armcp_temp_max = 6,
 	armcp_temp_max_hyst,
 	armcp_temp_crit,
-	armcp_temp_crit_hyst
+	armcp_temp_crit_hyst,
+	armcp_temp_offset = 19
 };
 
 enum armcp_in_attributes {
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/5] habanalabs: modify the return values of hl_read/write routines
  2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
  2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
  2020-02-07  8:15 ` [PATCH 3/5] habanalabs: support temperature offset via sysfs Oded Gabbay
@ 2020-02-07  8:15 ` Oded Gabbay
  2020-02-07  8:15 ` [PATCH 5/5] habanalabs: provide historical maximum of various sensors Oded Gabbay
  3 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh, Moti Haimovski

From: Moti Haimovski <mhaimovski@habana.ai>

The hl read and write routines implement the hwmon_ops read and write
interface routines respectively.
These routines are expected to return a completion status when called,
which was not the case until this commit.
This commit modifies these routines to return 0 upon success and a
negative error value upon failure.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/habanalabs.h | 17 +++++---
 drivers/misc/habanalabs/hwmon.c      | 63 ++++++++++++++--------------
 2 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 4de12d3ff836..9472da3ef847 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1609,13 +1609,18 @@ int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask);
 
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
-long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
+int hl_get_temperature(struct hl_device *hdev,
+		       int sensor_index, u32 attr, long *value);
 int hl_set_temperature(struct hl_device *hdev,
-			int sensor_index, u32 attr, long value);
-long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr);
-long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr);
-long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr);
-long hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr);
+		       int sensor_index, u32 attr, long value);
+int hl_get_voltage(struct hl_device *hdev,
+		   int sensor_index, u32 attr, long *value);
+int hl_get_current(struct hl_device *hdev,
+		   int sensor_index, u32 attr, long *value);
+int hl_get_fan_speed(struct hl_device *hdev,
+		     int sensor_index, u32 attr, long *value);
+int hl_get_pwm_info(struct hl_device *hdev,
+		    int sensor_index, u32 attr, long *value);
 void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 			long value);
 u64 hl_get_max_power(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/hwmon.c
index 70088fdb0a5b..3539190b1caa 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/hwmon.c
@@ -113,6 +113,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			u32 attr, int channel, long *val)
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
+	int rc;
 
 	if (hl_device_disabled_or_in_reset(hdev))
 		return -ENODEV;
@@ -131,7 +132,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			return -EINVAL;
 		}
 
-		*val = hl_get_temperature(hdev, channel, attr);
+		rc = hl_get_temperature(hdev, channel, attr, val);
 		break;
 	case hwmon_in:
 		switch (attr) {
@@ -143,7 +144,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			return -EINVAL;
 		}
 
-		*val = hl_get_voltage(hdev, channel, attr);
+		rc = hl_get_voltage(hdev, channel, attr, val);
 		break;
 	case hwmon_curr:
 		switch (attr) {
@@ -155,7 +156,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			return -EINVAL;
 		}
 
-		*val = hl_get_current(hdev, channel, attr);
+		rc = hl_get_current(hdev, channel, attr, val);
 		break;
 	case hwmon_fan:
 		switch (attr) {
@@ -166,7 +167,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		default:
 			return -EINVAL;
 		}
-		*val = hl_get_fan_speed(hdev, channel, attr);
+		rc = hl_get_fan_speed(hdev, channel, attr, val);
 		break;
 	case hwmon_pwm:
 		switch (attr) {
@@ -176,12 +177,12 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		default:
 			return -EINVAL;
 		}
-		*val = hl_get_pwm_info(hdev, channel, attr);
+		rc = hl_get_pwm_info(hdev, channel, attr, val);
 		break;
 	default:
 		return -EINVAL;
 	}
-	return 0;
+	return rc;
 }
 
 static int hl_write(struct device *dev, enum hwmon_sensor_types type,
@@ -277,10 +278,10 @@ static const struct hwmon_ops hl_hwmon_ops = {
 	.write = hl_write
 };
 
-long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_temperature(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -291,16 +292,16 @@ long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-			SENSORS_PKT_TIMEOUT, &result);
+			SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get temperature from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
 int hl_set_temperature(struct hl_device *hdev,
@@ -328,10 +329,10 @@ int hl_set_temperature(struct hl_device *hdev,
 	return rc;
 }
 
-long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_voltage(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -342,22 +343,22 @@ long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					SENSORS_PKT_TIMEOUT, &result);
+					SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get voltage from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
-long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_current(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -368,22 +369,22 @@ long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					SENSORS_PKT_TIMEOUT, &result);
+					SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get current from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
-long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_fan_speed(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -394,22 +395,22 @@ long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					SENSORS_PKT_TIMEOUT, &result);
+					SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get fan speed from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
-long hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_pwm_info(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -420,16 +421,16 @@ long hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					SENSORS_PKT_TIMEOUT, &result);
+					SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get pwm info from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
 void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/5] habanalabs: provide historical maximum of various sensors
  2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
                   ` (2 preceding siblings ...)
  2020-02-07  8:15 ` [PATCH 4/5] habanalabs: modify the return values of hl_read/write routines Oded Gabbay
@ 2020-02-07  8:15 ` Oded Gabbay
  3 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh, Christine Gharzuzi

From: Christine Gharzuzi <cgharzuzi@habana.ai>

Add support for hwmon_in_highest, hwmon_temp_highest and hwmon_curr_highest
attributes. These attributes retrieve the historical maximum voltage,
temperature and current that were sampled, respectively.

Signed-off-by: Christine Gharzuzi <cgharzuzi@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/hwmon.c            | 6 ++++++
 drivers/misc/habanalabs/include/armcp_if.h | 9 ++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/hwmon.c
index 3539190b1caa..a21a26e07c3b 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/hwmon.c
@@ -127,6 +127,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		case hwmon_temp_max_hyst:
 		case hwmon_temp_crit_hyst:
 		case hwmon_temp_offset:
+		case hwmon_temp_highest:
 			break;
 		default:
 			return -EINVAL;
@@ -139,6 +140,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		case hwmon_in_input:
 		case hwmon_in_min:
 		case hwmon_in_max:
+		case hwmon_in_highest:
 			break;
 		default:
 			return -EINVAL;
@@ -151,6 +153,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		case hwmon_curr_input:
 		case hwmon_curr_min:
 		case hwmon_curr_max:
+		case hwmon_curr_highest:
 			break;
 		default:
 			return -EINVAL;
@@ -230,6 +233,7 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type,
 		case hwmon_temp_max_hyst:
 		case hwmon_temp_crit:
 		case hwmon_temp_crit_hyst:
+		case hwmon_temp_highest:
 			return 0444;
 		case hwmon_temp_offset:
 			return 0644;
@@ -240,6 +244,7 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type,
 		case hwmon_in_input:
 		case hwmon_in_min:
 		case hwmon_in_max:
+		case hwmon_in_highest:
 			return 0444;
 		}
 		break;
@@ -248,6 +253,7 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type,
 		case hwmon_curr_input:
 		case hwmon_curr_min:
 		case hwmon_curr_max:
+		case hwmon_curr_highest:
 			return 0444;
 		}
 		break;
diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h
index 014549eaf919..bdd0a4c3a9cf 100644
--- a/drivers/misc/habanalabs/include/armcp_if.h
+++ b/drivers/misc/habanalabs/include/armcp_if.h
@@ -287,19 +287,22 @@ enum armcp_temp_type {
 	armcp_temp_max_hyst,
 	armcp_temp_crit,
 	armcp_temp_crit_hyst,
-	armcp_temp_offset = 19
+	armcp_temp_offset = 19,
+	armcp_temp_highest = 22
 };
 
 enum armcp_in_attributes {
 	armcp_in_input,
 	armcp_in_min,
-	armcp_in_max
+	armcp_in_max,
+	armcp_in_highest = 7
 };
 
 enum armcp_curr_attributes {
 	armcp_curr_input,
 	armcp_curr_min,
-	armcp_curr_max
+	armcp_curr_max,
+	armcp_curr_highest = 7
 };
 
 enum armcp_fan_attributes {
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* RE: [PATCH 2/5] habanalabs: ratelimit error prints of IRQs
  2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
@ 2020-02-09  6:33   ` Omer Shpigelman
  0 siblings, 0 replies; 6+ messages in thread
From: Omer Shpigelman @ 2020-02-09  6:33 UTC (permalink / raw)
  To: Oded Gabbay, linux-kernel, Tomer Tayar; +Cc: gregkh

On Fri, Feb 7, 2020 at 10:15 AM Oded Gabbay <oded.gabbay@gmail.com> wrote:
> The compute engines can perform millions of transactions per second.
> If there is a bug in the S/W stack, we could get a lot of interrupts and
> spam the kernel log. Therefore, ratelimit these prints
>
> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>

Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-02-09  6:34 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
2020-02-09  6:33   ` Omer Shpigelman
2020-02-07  8:15 ` [PATCH 3/5] habanalabs: support temperature offset via sysfs Oded Gabbay
2020-02-07  8:15 ` [PATCH 4/5] habanalabs: modify the return values of hl_read/write routines Oded Gabbay
2020-02-07  8:15 ` [PATCH 5/5] habanalabs: provide historical maximum of various sensors Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).