All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] habanalabs: add debugfs write64/read64
@ 2020-02-07  8:15 Oded Gabbay
  2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh, Moti Haimovski

From: Moti Haimovski <mhaimovski@habana.ai>

Allow debug user to write/read 64-bit data through debugfs.
This will expedite the dump process of the (large) internal
memories of the device done during debug.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 .../ABI/testing/debugfs-driver-habanalabs     | 14 +++
 drivers/misc/habanalabs/debugfs.c             | 71 ++++++++++++++
 drivers/misc/habanalabs/goya/goya.c           | 92 +++++++++++++++++++
 drivers/misc/habanalabs/habanalabs.h          |  2 +
 4 files changed, 179 insertions(+)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index f0ac14b70ecb..a73601c5121e 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -43,6 +43,20 @@ Description:    Allows the root user to read or write directly through the
                 If the IOMMU is disabled, it also allows the root user to read
                 or write from the host a device VA of a host mapped memory
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/data64
+Date:           Jan 2020
+KernelVersion:  5.6
+Contact:        oded.gabbay@gmail.com
+Description:    Allows the root user to read or write 64 bit data directly
+                through the device's PCI bar. Writing to this file generates a
+                write transaction while reading from the file generates a read
+                transaction. This custom interface is needed (instead of using
+                the generic Linux user-space PCI mapping) because the DDR bar
+                is very small compared to the DDR memory and only the driver can
+                move the bar before and after the transaction.
+                If the IOMMU is disabled, it also allows the root user to read
+                or write from the host a device VA of a host mapped memory
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/device
 Date:           Jan 2019
 KernelVersion:  5.1
diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index 599d17dfd542..756d36ed5d95 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -710,6 +710,65 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 	return count;
 }
 
+static ssize_t hl_data_read64(struct file *f, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+	struct hl_device *hdev = entry->hdev;
+	char tmp_buf[32];
+	u64 addr = entry->addr;
+	u64 val;
+	ssize_t rc;
+
+	if (*ppos)
+		return 0;
+
+	if (hl_is_device_va(hdev, addr)) {
+		rc = device_va_to_pa(hdev, addr, &addr);
+		if (rc)
+			return rc;
+	}
+
+	rc = hdev->asic_funcs->debugfs_read64(hdev, addr, &val);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to read from 0x%010llx\n", addr);
+		return rc;
+	}
+
+	sprintf(tmp_buf, "0x%016llx\n", val);
+	return simple_read_from_buffer(buf, count, ppos, tmp_buf,
+			strlen(tmp_buf));
+}
+
+static ssize_t hl_data_write64(struct file *f, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+	struct hl_device *hdev = entry->hdev;
+	u64 addr = entry->addr;
+	u64 value;
+	ssize_t rc;
+
+	rc = kstrtoull_from_user(buf, count, 16, &value);
+	if (rc)
+		return rc;
+
+	if (hl_is_device_va(hdev, addr)) {
+		rc = device_va_to_pa(hdev, addr, &addr);
+		if (rc)
+			return rc;
+	}
+
+	rc = hdev->asic_funcs->debugfs_write64(hdev, addr, value);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to write 0x%016llx to 0x%010llx\n",
+			value, addr);
+		return rc;
+	}
+
+	return count;
+}
+
 static ssize_t hl_get_power_state(struct file *f, char __user *buf,
 		size_t count, loff_t *ppos)
 {
@@ -917,6 +976,12 @@ static const struct file_operations hl_data32b_fops = {
 	.write = hl_data_write32
 };
 
+static const struct file_operations hl_data64b_fops = {
+	.owner = THIS_MODULE,
+	.read = hl_data_read64,
+	.write = hl_data_write64
+};
+
 static const struct file_operations hl_i2c_data_fops = {
 	.owner = THIS_MODULE,
 	.read = hl_i2c_data_read,
@@ -1030,6 +1095,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 				dev_entry,
 				&hl_data32b_fops);
 
+	debugfs_create_file("data64",
+				0644,
+				dev_entry->root,
+				dev_entry,
+				&hl_data64b_fops);
+
 	debugfs_create_file("set_power_state",
 				0200,
 				dev_entry->root,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index f634e9c5cad9..0b6567b48622 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4180,6 +4180,96 @@ static int goya_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
 	return rc;
 }
 
+static int goya_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 ddr_bar_addr;
+	int rc = 0;
+
+	if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
+		u32 val_l = RREG32(addr - CFG_BASE);
+		u32 val_h = RREG32(addr + sizeof(u32) - CFG_BASE);
+
+		*val = (((u64) val_h) << 32) | val_l;
+
+	} else if ((addr >= SRAM_BASE_ADDR) &&
+			(addr <= SRAM_BASE_ADDR + SRAM_SIZE - sizeof(u64))) {
+
+		*val = readq(hdev->pcie_bar[SRAM_CFG_BAR_ID] +
+				(addr - SRAM_BASE_ADDR));
+
+	} else if ((addr >= DRAM_PHYS_BASE) &&
+		   (addr <=
+		    DRAM_PHYS_BASE + hdev->asic_prop.dram_size - sizeof(u64))) {
+
+		u64 bar_base_addr = DRAM_PHYS_BASE +
+				(addr & ~(prop->dram_pci_bar_size - 0x1ull));
+
+		ddr_bar_addr = goya_set_ddr_bar_base(hdev, bar_base_addr);
+		if (ddr_bar_addr != U64_MAX) {
+			*val = readq(hdev->pcie_bar[DDR_BAR_ID] +
+						(addr - bar_base_addr));
+
+			ddr_bar_addr = goya_set_ddr_bar_base(hdev,
+							ddr_bar_addr);
+		}
+		if (ddr_bar_addr == U64_MAX)
+			rc = -EIO;
+
+	} else if (addr >= HOST_PHYS_BASE && !iommu_present(&pci_bus_type)) {
+		*val = *(u64 *) phys_to_virt(addr - HOST_PHYS_BASE);
+
+	} else {
+		rc = -EFAULT;
+	}
+
+	return rc;
+}
+
+static int goya_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 ddr_bar_addr;
+	int rc = 0;
+
+	if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
+		WREG32(addr - CFG_BASE, lower_32_bits(val));
+		WREG32(addr + sizeof(u32) - CFG_BASE, upper_32_bits(val));
+
+	} else if ((addr >= SRAM_BASE_ADDR) &&
+			(addr <= SRAM_BASE_ADDR + SRAM_SIZE - sizeof(u64))) {
+
+		writeq(val, hdev->pcie_bar[SRAM_CFG_BAR_ID] +
+					(addr - SRAM_BASE_ADDR));
+
+	} else if ((addr >= DRAM_PHYS_BASE) &&
+		   (addr <=
+		    DRAM_PHYS_BASE + hdev->asic_prop.dram_size - sizeof(u64))) {
+
+		u64 bar_base_addr = DRAM_PHYS_BASE +
+				(addr & ~(prop->dram_pci_bar_size - 0x1ull));
+
+		ddr_bar_addr = goya_set_ddr_bar_base(hdev, bar_base_addr);
+		if (ddr_bar_addr != U64_MAX) {
+			writeq(val, hdev->pcie_bar[DDR_BAR_ID] +
+						(addr - bar_base_addr));
+
+			ddr_bar_addr = goya_set_ddr_bar_base(hdev,
+							ddr_bar_addr);
+		}
+		if (ddr_bar_addr == U64_MAX)
+			rc = -EIO;
+
+	} else if (addr >= HOST_PHYS_BASE && !iommu_present(&pci_bus_type)) {
+		*(u64 *) phys_to_virt(addr - HOST_PHYS_BASE) = val;
+
+	} else {
+		rc = -EFAULT;
+	}
+
+	return rc;
+}
+
 static u64 goya_read_pte(struct hl_device *hdev, u64 addr)
 {
 	struct goya_device *goya = hdev->asic_specific;
@@ -5186,6 +5276,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.restore_phase_topology = goya_restore_phase_topology,
 	.debugfs_read32 = goya_debugfs_read32,
 	.debugfs_write32 = goya_debugfs_write32,
+	.debugfs_read64 = goya_debugfs_read64,
+	.debugfs_write64 = goya_debugfs_write64,
 	.add_device_attr = goya_add_device_attr,
 	.handle_eqe = goya_handle_eqe,
 	.set_pll_profile = goya_set_pll_profile,
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 954906292c00..4ef8cf23d099 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -582,6 +582,8 @@ struct hl_asic_funcs {
 	void (*restore_phase_topology)(struct hl_device *hdev);
 	int (*debugfs_read32)(struct hl_device *hdev, u64 addr, u32 *val);
 	int (*debugfs_write32)(struct hl_device *hdev, u64 addr, u32 val);
+	int (*debugfs_read64)(struct hl_device *hdev, u64 addr, u64 *val);
+	int (*debugfs_write64)(struct hl_device *hdev, u64 addr, u64 val);
 	void (*add_device_attr)(struct hl_device *hdev,
 				struct attribute_group *dev_attr_grp);
 	void (*handle_eqe)(struct hl_device *hdev,
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/5] habanalabs: ratelimit error prints of IRQs
  2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
@ 2020-02-07  8:15 ` Oded Gabbay
  2020-02-09  6:33   ` Omer Shpigelman
  2020-02-07  8:15 ` [PATCH 3/5] habanalabs: support temperature offset via sysfs Oded Gabbay
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh

The compute engines can perform millions of transactions per second. If
there is a bug in the S/W stack, we could get a lot of interrupts and spam
the kernel log. Therefore, ratelimit these prints

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/goya/goya.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 0b6567b48622..19bce06e5fc0 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4480,22 +4480,22 @@ static void goya_get_event_desc(u16 event_type, char *desc, size_t size)
 static void goya_print_razwi_info(struct hl_device *hdev)
 {
 	if (RREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD)) {
-		dev_err(hdev->dev, "Illegal write to LBW\n");
+		dev_err_ratelimited(hdev->dev, "Illegal write to LBW\n");
 		WREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD, 0);
 	}
 
 	if (RREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD)) {
-		dev_err(hdev->dev, "Illegal read from LBW\n");
+		dev_err_ratelimited(hdev->dev, "Illegal read from LBW\n");
 		WREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD, 0);
 	}
 
 	if (RREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD)) {
-		dev_err(hdev->dev, "Illegal write to HBW\n");
+		dev_err_ratelimited(hdev->dev, "Illegal write to HBW\n");
 		WREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD, 0);
 	}
 
 	if (RREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD)) {
-		dev_err(hdev->dev, "Illegal read from HBW\n");
+		dev_err_ratelimited(hdev->dev, "Illegal read from HBW\n");
 		WREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD, 0);
 	}
 }
@@ -4515,7 +4515,8 @@ static void goya_print_mmu_error_info(struct hl_device *hdev)
 		addr <<= 32;
 		addr |= RREG32(mmMMU_PAGE_ERROR_CAPTURE_VA);
 
-		dev_err(hdev->dev, "MMU page fault on va 0x%llx\n", addr);
+		dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n",
+					addr);
 
 		WREG32(mmMMU_PAGE_ERROR_CAPTURE, 0);
 	}
@@ -4527,7 +4528,7 @@ static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
 	char desc[20] = "";
 
 	goya_get_event_desc(event_type, desc, sizeof(desc));
-	dev_err(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
+	dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
 		event_type, desc);
 
 	if (razwi) {
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/5] habanalabs: support temperature offset via sysfs
  2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
  2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
@ 2020-02-07  8:15 ` Oded Gabbay
  2020-02-07  8:15 ` [PATCH 4/5] habanalabs: modify the return values of hl_read/write routines Oded Gabbay
  2020-02-07  8:15 ` [PATCH 5/5] habanalabs: provide historical maximum of various sensors Oded Gabbay
  3 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh, Moti Haimovski

From: Moti Haimovski <mhaimovski@habana.ai>

This commit adds support for offsetting the temperatures reading
by a specified value as defined in
https://www.kernel.org/doc/Documentation/hwmon/sysfs-interface
using the standard sysfs defined for hwmon.
This is required by system administrators to inject errors to test
their monitoring applications in data centers.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/habanalabs.h       |  2 ++
 drivers/misc/habanalabs/hwmon.c            | 37 ++++++++++++++++++++++
 drivers/misc/habanalabs/include/armcp_if.h | 13 +++++++-
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 4ef8cf23d099..4de12d3ff836 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1610,6 +1610,8 @@ int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask);
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
 long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
+int hl_set_temperature(struct hl_device *hdev,
+			int sensor_index, u32 attr, long value);
 long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr);
 long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr);
 long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr);
diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/hwmon.c
index 7be4bace9b4f..70088fdb0a5b 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/hwmon.c
@@ -125,6 +125,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		case hwmon_temp_crit:
 		case hwmon_temp_max_hyst:
 		case hwmon_temp_crit_hyst:
+		case hwmon_temp_offset:
 			break;
 		default:
 			return -EINVAL;
@@ -192,6 +193,15 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 		return -ENODEV;
 
 	switch (type) {
+	case hwmon_temp:
+		switch (attr) {
+		case hwmon_temp_offset:
+			break;
+		default:
+			return -EINVAL;
+		}
+		hl_set_temperature(hdev, channel, attr, val);
+		break;
 	case hwmon_pwm:
 		switch (attr) {
 		case hwmon_pwm_input:
@@ -220,6 +230,8 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type,
 		case hwmon_temp_crit:
 		case hwmon_temp_crit_hyst:
 			return 0444;
+		case hwmon_temp_offset:
+			return 0644;
 		}
 		break;
 	case hwmon_in:
@@ -291,6 +303,31 @@ long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr)
 	return result;
 }
 
+int hl_set_temperature(struct hl_device *hdev,
+			int sensor_index, u32 attr, long value)
+{
+	struct armcp_packet pkt;
+	int rc;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_SET <<
+				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.sensor_index = __cpu_to_le16(sensor_index);
+	pkt.type = __cpu_to_le16(attr);
+	pkt.value = __cpu_to_le64(value);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+						SENSORS_PKT_TIMEOUT, NULL);
+
+	if (rc)
+		dev_err(hdev->dev,
+			"Failed to set temperature of sensor %d, error %d\n",
+			sensor_index, rc);
+
+	return rc;
+}
+
 long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr)
 {
 	struct armcp_packet pkt;
diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h
index e4c6699a1868..014549eaf919 100644
--- a/drivers/misc/habanalabs/include/armcp_if.h
+++ b/drivers/misc/habanalabs/include/armcp_if.h
@@ -189,6 +189,10 @@ enum pq_init_status {
  *       ArmCP to write to the structure, to prevent data corruption in case of
  *       mismatched driver/FW versions.
  *
+ * ARMCP_PACKET_TEMPERATURE_SET -
+ *       Set the value of the offset property of a specified thermal sensor.
+ *       The packet's arguments specify the desired sensor and the field to
+ *       set.
  */
 
 enum armcp_packet_id {
@@ -214,6 +218,8 @@ enum armcp_packet_id {
 	ARMCP_PACKET_MAX_POWER_GET,		/* sysfs */
 	ARMCP_PACKET_MAX_POWER_SET,		/* sysfs */
 	ARMCP_PACKET_EEPROM_DATA_GET,		/* sysfs */
+	ARMCP_RESERVED,
+	ARMCP_PACKET_TEMPERATURE_SET,		/* sysfs */
 };
 
 #define ARMCP_PACKET_FENCE_VAL	0xFE8CE7A5
@@ -271,12 +277,17 @@ enum armcp_packet_rc {
 	armcp_packet_fault
 };
 
+/*
+ * armcp_temp_type should adhere to hwmon_temp_attributes
+ * defined in Linux kernel hwmon.h file
+ */
 enum armcp_temp_type {
 	armcp_temp_input,
 	armcp_temp_max = 6,
 	armcp_temp_max_hyst,
 	armcp_temp_crit,
-	armcp_temp_crit_hyst
+	armcp_temp_crit_hyst,
+	armcp_temp_offset = 19
 };
 
 enum armcp_in_attributes {
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/5] habanalabs: modify the return values of hl_read/write routines
  2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
  2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
  2020-02-07  8:15 ` [PATCH 3/5] habanalabs: support temperature offset via sysfs Oded Gabbay
@ 2020-02-07  8:15 ` Oded Gabbay
  2020-02-07  8:15 ` [PATCH 5/5] habanalabs: provide historical maximum of various sensors Oded Gabbay
  3 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh, Moti Haimovski

From: Moti Haimovski <mhaimovski@habana.ai>

The hl read and write routines implement the hwmon_ops read and write
interface routines respectively.
These routines are expected to return a completion status when called,
which was not the case until this commit.
This commit modifies these routines to return 0 upon success and a
negative error value upon failure.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/habanalabs.h | 17 +++++---
 drivers/misc/habanalabs/hwmon.c      | 63 ++++++++++++++--------------
 2 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 4de12d3ff836..9472da3ef847 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1609,13 +1609,18 @@ int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask);
 
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
-long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
+int hl_get_temperature(struct hl_device *hdev,
+		       int sensor_index, u32 attr, long *value);
 int hl_set_temperature(struct hl_device *hdev,
-			int sensor_index, u32 attr, long value);
-long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr);
-long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr);
-long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr);
-long hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr);
+		       int sensor_index, u32 attr, long value);
+int hl_get_voltage(struct hl_device *hdev,
+		   int sensor_index, u32 attr, long *value);
+int hl_get_current(struct hl_device *hdev,
+		   int sensor_index, u32 attr, long *value);
+int hl_get_fan_speed(struct hl_device *hdev,
+		     int sensor_index, u32 attr, long *value);
+int hl_get_pwm_info(struct hl_device *hdev,
+		    int sensor_index, u32 attr, long *value);
 void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 			long value);
 u64 hl_get_max_power(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/hwmon.c
index 70088fdb0a5b..3539190b1caa 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/hwmon.c
@@ -113,6 +113,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			u32 attr, int channel, long *val)
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
+	int rc;
 
 	if (hl_device_disabled_or_in_reset(hdev))
 		return -ENODEV;
@@ -131,7 +132,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			return -EINVAL;
 		}
 
-		*val = hl_get_temperature(hdev, channel, attr);
+		rc = hl_get_temperature(hdev, channel, attr, val);
 		break;
 	case hwmon_in:
 		switch (attr) {
@@ -143,7 +144,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			return -EINVAL;
 		}
 
-		*val = hl_get_voltage(hdev, channel, attr);
+		rc = hl_get_voltage(hdev, channel, attr, val);
 		break;
 	case hwmon_curr:
 		switch (attr) {
@@ -155,7 +156,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			return -EINVAL;
 		}
 
-		*val = hl_get_current(hdev, channel, attr);
+		rc = hl_get_current(hdev, channel, attr, val);
 		break;
 	case hwmon_fan:
 		switch (attr) {
@@ -166,7 +167,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		default:
 			return -EINVAL;
 		}
-		*val = hl_get_fan_speed(hdev, channel, attr);
+		rc = hl_get_fan_speed(hdev, channel, attr, val);
 		break;
 	case hwmon_pwm:
 		switch (attr) {
@@ -176,12 +177,12 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		default:
 			return -EINVAL;
 		}
-		*val = hl_get_pwm_info(hdev, channel, attr);
+		rc = hl_get_pwm_info(hdev, channel, attr, val);
 		break;
 	default:
 		return -EINVAL;
 	}
-	return 0;
+	return rc;
 }
 
 static int hl_write(struct device *dev, enum hwmon_sensor_types type,
@@ -277,10 +278,10 @@ static const struct hwmon_ops hl_hwmon_ops = {
 	.write = hl_write
 };
 
-long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_temperature(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -291,16 +292,16 @@ long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-			SENSORS_PKT_TIMEOUT, &result);
+			SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get temperature from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
 int hl_set_temperature(struct hl_device *hdev,
@@ -328,10 +329,10 @@ int hl_set_temperature(struct hl_device *hdev,
 	return rc;
 }
 
-long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_voltage(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -342,22 +343,22 @@ long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					SENSORS_PKT_TIMEOUT, &result);
+					SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get voltage from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
-long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_current(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -368,22 +369,22 @@ long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					SENSORS_PKT_TIMEOUT, &result);
+					SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get current from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
-long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_fan_speed(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -394,22 +395,22 @@ long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					SENSORS_PKT_TIMEOUT, &result);
+					SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get fan speed from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
-long hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr)
+int hl_get_pwm_info(struct hl_device *hdev,
+			int sensor_index, u32 attr, long *value)
 {
 	struct armcp_packet pkt;
-	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -420,16 +421,16 @@ long hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr)
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					SENSORS_PKT_TIMEOUT, &result);
+					SENSORS_PKT_TIMEOUT, value);
 
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to get pwm info from sensor %d, error %d\n",
 			sensor_index, rc);
-		result = 0;
+		*value = 0;
 	}
 
-	return result;
+	return rc;
 }
 
 void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/5] habanalabs: provide historical maximum of various sensors
  2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
                   ` (2 preceding siblings ...)
  2020-02-07  8:15 ` [PATCH 4/5] habanalabs: modify the return values of hl_read/write routines Oded Gabbay
@ 2020-02-07  8:15 ` Oded Gabbay
  3 siblings, 0 replies; 6+ messages in thread
From: Oded Gabbay @ 2020-02-07  8:15 UTC (permalink / raw)
  To: linux-kernel, oshpigelman, ttayar; +Cc: gregkh, Christine Gharzuzi

From: Christine Gharzuzi <cgharzuzi@habana.ai>

Add support for hwmon_in_highest, hwmon_temp_highest and hwmon_curr_highest
attributes. These attributes retrieve the historical maximum voltage,
temperature and current that were sampled, respectively.

Signed-off-by: Christine Gharzuzi <cgharzuzi@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/hwmon.c            | 6 ++++++
 drivers/misc/habanalabs/include/armcp_if.h | 9 ++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/hwmon.c
index 3539190b1caa..a21a26e07c3b 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/hwmon.c
@@ -127,6 +127,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		case hwmon_temp_max_hyst:
 		case hwmon_temp_crit_hyst:
 		case hwmon_temp_offset:
+		case hwmon_temp_highest:
 			break;
 		default:
 			return -EINVAL;
@@ -139,6 +140,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		case hwmon_in_input:
 		case hwmon_in_min:
 		case hwmon_in_max:
+		case hwmon_in_highest:
 			break;
 		default:
 			return -EINVAL;
@@ -151,6 +153,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		case hwmon_curr_input:
 		case hwmon_curr_min:
 		case hwmon_curr_max:
+		case hwmon_curr_highest:
 			break;
 		default:
 			return -EINVAL;
@@ -230,6 +233,7 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type,
 		case hwmon_temp_max_hyst:
 		case hwmon_temp_crit:
 		case hwmon_temp_crit_hyst:
+		case hwmon_temp_highest:
 			return 0444;
 		case hwmon_temp_offset:
 			return 0644;
@@ -240,6 +244,7 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type,
 		case hwmon_in_input:
 		case hwmon_in_min:
 		case hwmon_in_max:
+		case hwmon_in_highest:
 			return 0444;
 		}
 		break;
@@ -248,6 +253,7 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type,
 		case hwmon_curr_input:
 		case hwmon_curr_min:
 		case hwmon_curr_max:
+		case hwmon_curr_highest:
 			return 0444;
 		}
 		break;
diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h
index 014549eaf919..bdd0a4c3a9cf 100644
--- a/drivers/misc/habanalabs/include/armcp_if.h
+++ b/drivers/misc/habanalabs/include/armcp_if.h
@@ -287,19 +287,22 @@ enum armcp_temp_type {
 	armcp_temp_max_hyst,
 	armcp_temp_crit,
 	armcp_temp_crit_hyst,
-	armcp_temp_offset = 19
+	armcp_temp_offset = 19,
+	armcp_temp_highest = 22
 };
 
 enum armcp_in_attributes {
 	armcp_in_input,
 	armcp_in_min,
-	armcp_in_max
+	armcp_in_max,
+	armcp_in_highest = 7
 };
 
 enum armcp_curr_attributes {
 	armcp_curr_input,
 	armcp_curr_min,
-	armcp_curr_max
+	armcp_curr_max,
+	armcp_curr_highest = 7
 };
 
 enum armcp_fan_attributes {
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* RE: [PATCH 2/5] habanalabs: ratelimit error prints of IRQs
  2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
@ 2020-02-09  6:33   ` Omer Shpigelman
  0 siblings, 0 replies; 6+ messages in thread
From: Omer Shpigelman @ 2020-02-09  6:33 UTC (permalink / raw)
  To: Oded Gabbay, linux-kernel, Tomer Tayar; +Cc: gregkh

On Fri, Feb 7, 2020 at 10:15 AM Oded Gabbay <oded.gabbay@gmail.com> wrote:
> The compute engines can perform millions of transactions per second.
> If there is a bug in the S/W stack, we could get a lot of interrupts and
> spam the kernel log. Therefore, ratelimit these prints
>
> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>

Reviewed-by: Omer Shpigelman <oshpigelman@habana.ai>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-02-09  6:34 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-07  8:15 [PATCH 1/5] habanalabs: add debugfs write64/read64 Oded Gabbay
2020-02-07  8:15 ` [PATCH 2/5] habanalabs: ratelimit error prints of IRQs Oded Gabbay
2020-02-09  6:33   ` Omer Shpigelman
2020-02-07  8:15 ` [PATCH 3/5] habanalabs: support temperature offset via sysfs Oded Gabbay
2020-02-07  8:15 ` [PATCH 4/5] habanalabs: modify the return values of hl_read/write routines Oded Gabbay
2020-02-07  8:15 ` [PATCH 5/5] habanalabs: provide historical maximum of various sensors Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.