linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] habanalabs: add 'needs reset' state in driver
@ 2020-11-04 14:08 Oded Gabbay
  2020-11-04 14:08 ` [PATCH] habanalabs: fix hard reset print and comment Oded Gabbay
                   ` (10 more replies)
  0 siblings, 11 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:08 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

The new state indicates that device should be reset in order
to re-gain funcionality.
This unique state can occur if reset_on_lockup is disabled
and an actual lockup has occurred.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../misc/habanalabs/common/command_buffer.c   |  5 +--
 .../habanalabs/common/command_submission.c    |  7 ++--
 drivers/misc/habanalabs/common/debugfs.c      |  6 ++--
 drivers/misc/habanalabs/common/device.c       | 33 ++++++++++++++-----
 drivers/misc/habanalabs/common/habanalabs.h   | 14 ++++++--
 .../misc/habanalabs/common/habanalabs_drv.c   | 17 +++++++---
 .../misc/habanalabs/common/habanalabs_ioctl.c | 12 ++++---
 drivers/misc/habanalabs/common/hw_queue.c     |  6 ++--
 drivers/misc/habanalabs/common/hwmon.c        |  4 +--
 drivers/misc/habanalabs/common/memory.c       |  5 +--
 drivers/misc/habanalabs/common/sysfs.c        |  8 +++--
 drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c   |  8 ++---
 drivers/misc/habanalabs/goya/goya_hwmgr.c     | 28 ++++++++--------
 include/uapi/misc/habanalabs.h                |  3 +-
 14 files changed, 101 insertions(+), 55 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index 1fb05bf02340..360dc340b2f4 100644
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -380,13 +380,14 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	union hl_cb_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
+	enum hl_device_status status;
 	u64 handle = 0;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
 			"Device is %s. Can't execute CB IOCTL\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 662b19663839..ec014ef39484 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -427,6 +427,8 @@ static void cs_timedout(struct work_struct *work)
 
 	if (hdev->reset_on_lockup)
 		hl_device_reset(hdev, false, false);
+	else
+		hdev->needs_reset = true;
 }
 
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
@@ -689,12 +691,13 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
 	u32 cs_type_flags, num_chunks;
+	enum hl_device_status status;
 	enum hl_cs_type cs_type;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
 			"Device is %s. Can't submit new CS\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index b44193ec3d12..104b9686e57b 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -24,7 +24,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 	struct cpucp_packet pkt;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -EBUSY;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -50,7 +50,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 	struct cpucp_packet pkt;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -EBUSY;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -76,7 +76,7 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
 	struct cpucp_packet pkt;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return;
 
 	memset(&pkt, 0, sizeof(pkt));
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 3b82020648c7..59308a612b36 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -15,14 +15,6 @@
 
 #define HL_PLDM_PENDING_RESET_PER_SEC	(HL_PENDING_RESET_PER_SEC * 10)
 
-bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
-{
-	if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
-		return true;
-	else
-		return false;
-}
-
 enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
 	enum hl_device_status status;
@@ -31,12 +23,34 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
 		status = HL_DEVICE_STATUS_MALFUNCTION;
 	else if (atomic_read(&hdev->in_reset))
 		status = HL_DEVICE_STATUS_IN_RESET;
+	else if (hdev->needs_reset)
+		status = HL_DEVICE_STATUS_NEEDS_RESET;
 	else
 		status = HL_DEVICE_STATUS_OPERATIONAL;
 
 	return status;
 }
 
+bool hl_device_operational(struct hl_device *hdev,
+		enum hl_device_status *status)
+{
+	enum hl_device_status current_status;
+
+	current_status = hl_device_status(hdev);
+	if (status)
+		*status = current_status;
+
+	switch (current_status) {
+	case HL_DEVICE_STATUS_IN_RESET:
+	case HL_DEVICE_STATUS_MALFUNCTION:
+	case HL_DEVICE_STATUS_NEEDS_RESET:
+		return false;
+	case HL_DEVICE_STATUS_OPERATIONAL:
+	default:
+		return true;
+	}
+}
+
 static void hpriv_release(struct kref *ref)
 {
 	struct hl_fpriv *hpriv;
@@ -411,7 +425,7 @@ static void hl_device_heartbeat(struct work_struct *work)
 	struct hl_device *hdev = container_of(work, struct hl_device,
 						work_heartbeat.work);
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		goto reschedule;
 
 	if (!hdev->asic_funcs->send_heartbeat(hdev))
@@ -1091,6 +1105,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 	}
 
 	atomic_set(&hdev->in_reset, 0);
+	hdev->needs_reset = false;
 
 	if (hard_reset)
 		hdev->hard_reset_cnt++;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 2e3021adc824..05e53b5ffcfe 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1432,6 +1432,10 @@ struct hl_dbg_device_entry {
  * DEVICES
  */
 
+#define HL_STR_MAX	32
+
+#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_NEEDS_RESET + 1)
+
 /* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe
  * x16 cards. In extreme cases, there are hosts that can accommodate 16 cards.
  */
@@ -1706,6 +1710,7 @@ struct hl_mmu_funcs {
  * @hwmon_dev: H/W monitor device.
  * @pm_mng_profile: current power management profile.
  * @hl_chip_info: ASIC's sensors information.
+ * @device_status_description: device status description.
  * @hl_debugfs: device's debugfs manager.
  * @cb_pool: list of preallocated CBs.
  * @cb_pool_lock: protects the CB pool.
@@ -1774,6 +1779,8 @@ struct hl_mmu_funcs {
  * @supports_coresight: is CoreSight supported.
  * @supports_soft_reset: is soft reset supported.
  * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
+ * @needs_reset: true if reset_on_lockup is false and device should be reset
+ *               due to lockup.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -1786,7 +1793,8 @@ struct hl_device {
 	struct device			*dev_ctrl;
 	struct delayed_work		work_freq;
 	struct delayed_work		work_heartbeat;
-	char				asic_name[32];
+	char				asic_name[HL_STR_MAX];
+	char				status[HL_DEV_STS_MAX][HL_STR_MAX];
 	enum hl_asic_type		asic_type;
 	struct hl_cq			*completion_queue;
 	struct workqueue_struct		**cq_wq;
@@ -1876,6 +1884,7 @@ struct hl_device {
 	u8				supports_coresight;
 	u8				supports_soft_reset;
 	u8				supports_cb_mapping;
+	u8				needs_reset;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
@@ -1978,7 +1987,8 @@ static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
 
 int hl_device_open(struct inode *inode, struct file *filp);
 int hl_device_open_ctrl(struct inode *inode, struct file *filp);
-bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
+bool hl_device_operational(struct hl_device *hdev,
+		enum hl_device_status *status);
 enum hl_device_status hl_device_status(struct hl_device *hdev);
 int hl_device_set_debug_mode(struct hl_device *hdev, bool enable);
 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 20458bd82c5a..aac798f3296e 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -92,6 +92,7 @@ static enum hl_asic_type get_asic_type(u16 device)
  */
 int hl_device_open(struct inode *inode, struct file *filp)
 {
+	enum hl_device_status status;
 	struct hl_device *hdev;
 	struct hl_fpriv *hpriv;
 	int rc;
@@ -124,10 +125,10 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 	mutex_lock(&hdev->fpriv_list_lock);
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_err_ratelimited(hdev->dev,
-			"Can't open %s because it is disabled or in reset\n",
-			dev_name(hdev->dev));
+			"Can't open %s because it is %s\n",
+			dev_name(hdev->dev), hdev->status[status]);
 		rc = -EPERM;
 		goto out_err;
 	}
@@ -204,7 +205,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
 
 	mutex_lock(&hdev->fpriv_list_lock);
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		dev_err_ratelimited(hdev->dev_ctrl,
 			"Can't open %s because it is disabled or in reset\n",
 			dev_name(hdev->dev_ctrl));
@@ -287,6 +288,14 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 		hdev->asic_type = asic_type;
 	}
 
+	/* Assign status description string */
+	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
+					"disabled", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
+					"in reset", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
+					"needs reset", HL_STR_MAX);
+
 	hdev->major = hl_major;
 	hdev->reset_on_lockup = reset_on_lockup;
 	hdev->memory_scrub = memory_scrub;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 1d8bea626e78..0729cd43f297 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -406,8 +406,10 @@ static int total_energy_consumption_info(struct hl_fpriv *hpriv,
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
+	enum hl_device_status status;
 	struct hl_info_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
+
 	int rc;
 
 	/*
@@ -428,10 +430,10 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 		break;
 	}
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(dev,
 			"Device is %s. Can't execute INFO IOCTL\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
@@ -501,12 +503,14 @@ static int hl_debug_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	struct hl_debug_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
+	enum hl_device_status status;
+
 	int rc = 0;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
 			"Device is %s. Can't execute DEBUG IOCTL\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 84a7458f3363..606d27112dca 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -516,6 +516,7 @@ static void init_signal_wait_cs(struct hl_cs *cs)
  */
 int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 {
+	enum hl_device_status status;
 	struct hl_cs_counters_atomic *cntr;
 	struct hl_ctx *ctx = cs->ctx;
 	struct hl_device *hdev = ctx->hdev;
@@ -528,11 +529,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 
 	hdev->asic_funcs->hw_queues_lock(hdev);
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		atomic64_inc(&ctx->cs_counters.device_in_reset_drop_cnt);
-		atomic64_inc(&cntr->device_in_reset_drop_cnt);
 		dev_err(hdev->dev,
-			"device is disabled or in reset, CS rejected!\n");
+			"device is %s, CS rejected!\n", hdev->status[status]);
 		rc = -EPERM;
 		goto out;
 	}
diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c
index 892a5e2b0b9d..ab96401c3752 100644
--- a/drivers/misc/habanalabs/common/hwmon.c
+++ b/drivers/misc/habanalabs/common/hwmon.c
@@ -114,7 +114,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	switch (type) {
@@ -192,7 +192,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	switch (type) {
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 5c1dae6aaf4d..e00ad11dc5f7 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -1237,6 +1237,7 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 
 int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 {
+	enum hl_device_status status;
 	union hl_mem_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
@@ -1244,10 +1245,10 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 	u32 handle = 0;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
 			"Device is %s. Can't execute MEMORY IOCTL\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index 3ceae87016b1..94ca68e62000 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -276,6 +276,8 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr,
 		str = "In reset";
 	else if (hdev->disabled)
 		str = "Malfunction";
+	else if (hdev->needs_reset)
+		str = "Needs Reset";
 	else
 		str = "Operational";
 
@@ -304,7 +306,7 @@ static ssize_t max_power_show(struct device *dev, struct device_attribute *attr,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long val;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	val = hl_get_max_power(hdev);
@@ -319,7 +321,7 @@ static ssize_t max_power_store(struct device *dev,
 	unsigned long value;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
 		goto out;
 	}
@@ -347,7 +349,7 @@ static ssize_t eeprom_read_handler(struct file *filp, struct kobject *kobj,
 	char *data;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	if (!max_size)
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c b/drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c
index 1076b4932ce2..8c49da4bcbd5 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c
@@ -20,7 +20,7 @@ int gaudi_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk)
 {
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, MME_PLL, false);
@@ -54,7 +54,7 @@ static ssize_t clk_max_freq_mhz_show(struct device *dev,
 	struct gaudi_device *gaudi = hdev->asic_specific;
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, MME_PLL, false);
@@ -72,7 +72,7 @@ static ssize_t clk_max_freq_mhz_store(struct device *dev,
 	int rc;
 	u64 value;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
 		goto fail;
 	}
@@ -97,7 +97,7 @@ static ssize_t clk_cur_freq_mhz_show(struct device *dev,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, MME_PLL, true);
diff --git a/drivers/misc/habanalabs/goya/goya_hwmgr.c b/drivers/misc/habanalabs/goya/goya_hwmgr.c
index cdd4903e48fa..3acb36a1a902 100644
--- a/drivers/misc/habanalabs/goya/goya_hwmgr.c
+++ b/drivers/misc/habanalabs/goya/goya_hwmgr.c
@@ -36,7 +36,7 @@ int goya_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk)
 {
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, MME_PLL, false);
@@ -69,7 +69,7 @@ static ssize_t mme_clk_show(struct device *dev, struct device_attribute *attr,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, MME_PLL, false);
@@ -88,7 +88,7 @@ static ssize_t mme_clk_store(struct device *dev, struct device_attribute *attr,
 	int rc;
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
 		goto fail;
 	}
@@ -118,7 +118,7 @@ static ssize_t tpc_clk_show(struct device *dev, struct device_attribute *attr,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, TPC_PLL, false);
@@ -137,7 +137,7 @@ static ssize_t tpc_clk_store(struct device *dev, struct device_attribute *attr,
 	int rc;
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
 		goto fail;
 	}
@@ -167,7 +167,7 @@ static ssize_t ic_clk_show(struct device *dev, struct device_attribute *attr,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, IC_PLL, false);
@@ -186,7 +186,7 @@ static ssize_t ic_clk_store(struct device *dev, struct device_attribute *attr,
 	int rc;
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
 		goto fail;
 	}
@@ -216,7 +216,7 @@ static ssize_t mme_clk_curr_show(struct device *dev,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, MME_PLL, true);
@@ -233,7 +233,7 @@ static ssize_t tpc_clk_curr_show(struct device *dev,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, TPC_PLL, true);
@@ -250,7 +250,7 @@ static ssize_t ic_clk_curr_show(struct device *dev,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long value;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	value = hl_get_frequency(hdev, IC_PLL, true);
@@ -266,7 +266,7 @@ static ssize_t pm_mng_profile_show(struct device *dev,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	return sprintf(buf, "%s\n",
@@ -280,7 +280,7 @@ static ssize_t pm_mng_profile_store(struct device *dev,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
 		goto out;
 	}
@@ -335,7 +335,7 @@ static ssize_t high_pll_show(struct device *dev, struct device_attribute *attr,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	return sprintf(buf, "%u\n", hdev->high_pll);
@@ -348,7 +348,7 @@ static ssize_t high_pll_store(struct device *dev, struct device_attribute *attr,
 	long value;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
 		goto out;
 	}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 61f8f9144b54..d9cc782aba21 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -242,7 +242,8 @@ enum gaudi_engine_id {
 enum hl_device_status {
 	HL_DEVICE_STATUS_OPERATIONAL,
 	HL_DEVICE_STATUS_IN_RESET,
-	HL_DEVICE_STATUS_MALFUNCTION
+	HL_DEVICE_STATUS_MALFUNCTION,
+	HL_DEVICE_STATUS_NEEDS_RESET
 };
 
 /* Opcode for management ioctl
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: fix hard reset print and comment
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
@ 2020-11-04 14:08 ` Oded Gabbay
  2020-11-04 14:08 ` [PATCH] habanalabs/gaudi: increase MAX CS to 16K Oded Gabbay
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:08 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Omer Shpigelman

From: Omer Shpigelman <oshpigelman@habana.ai>

One of the first steps of a hard reset flow is to close all open user
contexts. This user process teradown might take some time due to long
cleanup in our driver or some other reason even before our cleanup flow.
Hence fix the relevant print and comment to be more accurate.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 421e37123b03..3b82020648c7 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -793,17 +793,20 @@ static int device_kill_open_processes(struct hl_device *hdev)
 
 	mutex_unlock(&hdev->fpriv_list_lock);
 
-	/* We killed the open users, but because the driver cleans up after the
-	 * user contexts are closed (e.g. mmu mappings), we need to wait again
-	 * to make sure the cleaning phase is finished before continuing with
-	 * the reset
+	/*
+	 * We killed the open users, but that doesn't mean they are closed.
+	 * It could be that they are running a long cleanup phase in the driver
+	 * e.g. MMU unmappings, or running other long teardown flow even before
+	 * our cleanup.
+	 * Therefore we need to wait again to make sure they are closed before
+	 * continuing with the reset.
 	 */
 
 	pending_cnt = pending_total;
 
 	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
 		dev_info(hdev->dev,
-			"Waiting for all unmap operations to finish before hard reset\n");
+			"Waiting for all user contexts to get closed before hard reset\n");
 
 		pending_cnt--;
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs/gaudi: increase MAX CS to 16K
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
  2020-11-04 14:08 ` [PATCH] habanalabs: fix hard reset print and comment Oded Gabbay
@ 2020-11-04 14:08 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs/gaudi: remove pcie_en strap toggle Oded Gabbay
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:08 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers

We need to have the MAX CS be much larger than the size of the
different queues. In GAUDI we have around 8 groups of queues, and each
group has 1K queue size. To prevent head-of-the-line blocking, we need
to make sure there is sufficient number of available CS allocations
even if one or more of those queues are full.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudiP.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 1dcf7c41e698..c3c24e8da4e2 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -60,7 +60,7 @@
 
 #define GAUDI_DEFAULT_CARD_NAME		"HL2000"
 
-#define GAUDI_MAX_PENDING_CS		1024
+#define GAUDI_MAX_PENDING_CS		SZ_16K
 
 #if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS)
 #error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs/gaudi: remove pcie_en strap toggle
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
  2020-11-04 14:08 ` [PATCH] habanalabs: fix hard reset print and comment Oded Gabbay
  2020-11-04 14:08 ` [PATCH] habanalabs/gaudi: increase MAX CS to 16K Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: Move repeatedly included headers to habanalabs.h Oded Gabbay
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Igor Grinberg

From: Igor Grinberg <igrinberg@habana.ai>

Since the very large grace period is over and this functionality
prevents us to implement the new reset sequence and apply security
settings, we need to remove the code toggling the PCIE_EN bit in the
straps register.
Remove it for good.

Signed-off-by: Igor Grinberg <igrinberg@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index ae4f3669261d..c075f7f10843 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3871,7 +3871,7 @@ static int gaudi_hw_init(struct hl_device *hdev)
 static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
-	u32 status, reset_timeout_ms, cpu_timeout_ms, boot_strap = 0;
+	u32 status, reset_timeout_ms, cpu_timeout_ms;
 
 	if (!hard_reset) {
 		dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n");
@@ -3903,16 +3903,6 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 	/* Tell ASIC not to re-initialize PCIe */
 	WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC);
 
-	boot_strap = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
-
-	/* H/W bug WA:
-	 * rdata[31:0] = strap_read_val;
-	 * wdata[31:0] = rdata[30:21],1'b0,rdata[20:0]
-	 */
-	boot_strap = (((boot_strap & 0x7FE00000) << 1) |
-			(boot_strap & 0x001FFFFF));
-	WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap & ~0x2);
-
 	/* Restart BTL/BLR upon hard-reset */
 	if (hdev->asic_prop.fw_security_disabled)
 		WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1);
@@ -3935,8 +3925,6 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 			"Timeout while waiting for device to reset 0x%x\n",
 			status);
 
-	WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap);
-
 	if (gaudi) {
 		gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
 				HW_CAP_HBM | HW_CAP_PCI_DMA |
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: Move repeatedly included headers to habanalabs.h
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
                   ` (2 preceding siblings ...)
  2020-11-04 14:09 ` [PATCH] habanalabs/gaudi: remove pcie_en strap toggle Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: release signal if collective wait was dropped Oded Gabbay
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Several header files are repeatedly included in many files.
Move these files to habanalabs.h which is included by all.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_buffer.c | 1 -
 drivers/misc/habanalabs/common/device.c         | 1 -
 drivers/misc/habanalabs/common/firmware_if.c    | 2 --
 drivers/misc/habanalabs/common/habanalabs.h     | 4 ++++
 drivers/misc/habanalabs/common/memory.c         | 1 -
 drivers/misc/habanalabs/common/mmu_v1.c         | 1 -
 drivers/misc/habanalabs/gaudi/gaudi.c           | 2 --
 drivers/misc/habanalabs/gaudi/gaudi_coresight.c | 2 --
 drivers/misc/habanalabs/goya/goya.c             | 2 --
 drivers/misc/habanalabs/goya/goya_coresight.c   | 2 --
 10 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index 901e213daf40..1fb05bf02340 100644
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -11,7 +11,6 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
-#include <linux/genalloc.h>
 
 static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 {
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 20572224099a..421e37123b03 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -10,7 +10,6 @@
 #include "habanalabs.h"
 
 #include <linux/pci.h>
-#include <linux/sched/signal.h>
 #include <linux/hwmon.h>
 #include <uapi/misc/habanalabs.h>
 
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index d84a70ec0ce1..fb9d901d5059 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -9,8 +9,6 @@
 #include "../include/common/hl_boot_if.h"
 
 #include <linux/firmware.h>
-#include <linux/genalloc.h>
-#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/slab.h>
 
 #define FW_FILE_MAX_SIZE	0x1400000 /* maximum size of 20MB */
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index ce516e9e1ebe..9f5a3c07f421 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -19,6 +19,10 @@
 #include <linux/scatterlist.h>
 #include <linux/hashtable.h>
 #include <linux/bitfield.h>
+#include <linux/genalloc.h>
+#include <linux/sched/signal.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/coresight.h>
 
 #define HL_NAME				"habanalabs"
 
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 75e269bc42a7..5c1dae6aaf4d 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -11,7 +11,6 @@
 
 #include <linux/uaccess.h>
 #include <linux/slab.h>
-#include <linux/genalloc.h>
 
 #define HL_MMU_DEBUG	0
 
diff --git a/drivers/misc/habanalabs/common/mmu_v1.c b/drivers/misc/habanalabs/common/mmu_v1.c
index ec7e8a3c37b8..ac3784523baa 100644
--- a/drivers/misc/habanalabs/common/mmu_v1.c
+++ b/drivers/misc/habanalabs/common/mmu_v1.c
@@ -8,7 +8,6 @@
 #include "habanalabs.h"
 #include "../include/hw_ip/mmu/mmu_general.h"
 
-#include <linux/genalloc.h>
 #include <linux/slab.h>
 
 static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 9e38ac6f7264..ae73ffb4ad64 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -17,8 +17,6 @@
 #include <linux/pci.h>
 #include <linux/firmware.h>
 #include <linux/hwmon.h>
-#include <linux/genalloc.h>
-#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/iommu.h>
 #include <linux/seq_file.h>
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
index 881531d4d9da..1f0edf78fba1 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
@@ -11,8 +11,6 @@
 #include "../include/gaudi/gaudi_masks.h"
 
 #include <uapi/misc/habanalabs.h>
-#include <linux/coresight.h>
-
 #define SPMU_SECTION_SIZE		MME0_ACC_SPMU_MAX_OFFSET
 #define SPMU_EVENT_TYPES_OFFSET		0x400
 #define SPMU_MAX_COUNTERS		6
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 80df2f3d438c..1b81102d1bec 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -12,9 +12,7 @@
 #include "../include/goya/goya_reg_map.h"
 
 #include <linux/pci.h>
-#include <linux/genalloc.h>
 #include <linux/hwmon.h>
-#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/iommu.h>
 #include <linux/seq_file.h>
 
diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c
index 4027a6a334d7..6fa03933b438 100644
--- a/drivers/misc/habanalabs/goya/goya_coresight.c
+++ b/drivers/misc/habanalabs/goya/goya_coresight.c
@@ -12,8 +12,6 @@
 
 #include <uapi/misc/habanalabs.h>
 
-#include <linux/coresight.h>
-
 #define GOYA_PLDM_CORESIGHT_TIMEOUT_USEC	(CORESIGHT_TIMEOUT_USEC * 100)
 
 #define SPMU_SECTION_SIZE		DMA_CH_0_CS_SPMU_MAX_OFFSET
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: release signal if collective wait was dropped
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
                   ` (3 preceding siblings ...)
  2020-11-04 14:09 ` [PATCH] habanalabs: Move repeatedly included headers to habanalabs.h Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: remove duplicate print Oded Gabbay
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

As in standard wait cs, we must release a signal fence once
a collective wait cs was dropped and not submitted.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 18cd20eec4c5..6de4b161856c 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -300,9 +300,11 @@ static void cs_do_release(struct kref *ref)
 
 	if (!cs->submitted) {
 		/* In case the wait for signal CS was submitted, the put occurs
-		 * in init_signal_wait_cs() right before hanging on the PQ.
+		 * in init_signal_wait_cs() or collective_wait_init_cs()
+		 * right before hanging on the PQ.
 		 */
-		if (cs->type == CS_TYPE_WAIT)
+		if (cs->type == CS_TYPE_WAIT ||
+				cs->type == CS_TYPE_COLLECTIVE_WAIT)
 			hl_fence_put(cs->signal_fence);
 
 		goto out;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: remove duplicate print
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
                   ` (4 preceding siblings ...)
  2020-11-04 14:09 ` [PATCH] habanalabs: release signal if collective wait was dropped Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: reset device upon fw read failure Oded Gabbay
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers

We print twice the firmware status regarding security, once in
common code and once in asic code. Remove the print in asic code
and leave the common code print.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 3 ---
 drivers/misc/habanalabs/goya/goya.c   | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 8abe46b8d615..ae4f3669261d 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -647,9 +647,6 @@ static int gaudi_early_init(struct hl_device *hdev)
 	if (rc)
 		goto free_queue_props;
 
-	dev_info(hdev->dev, "firmware-level security is %s\n",
-		hdev->asic_prop.fw_security_disabled ? "disabled" : "enabled");
-
 	return 0;
 
 free_queue_props:
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 6dea52a50be4..99f536f8a649 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -615,9 +615,6 @@ static int goya_early_init(struct hl_device *hdev)
 				"PCI strap is not configured correctly, PCI bus errors may occur\n");
 	}
 
-	dev_info(hdev->dev, "firmware-level security is %s\n",
-		hdev->asic_prop.fw_security_disabled ? "disabled" : "enabled");
-
 	return 0;
 
 free_queue_props:
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: reset device upon fw read failure
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
                   ` (5 preceding siblings ...)
  2020-11-04 14:09 ` [PATCH] habanalabs: remove duplicate print Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: restore vm_pgoff after mmap Oded Gabbay
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, farah kassabri

From: farah kassabri <fkassabri@habana.ai>

failure in reading pre-boot verion is not handled correctly,
upon failure we need to reset the device in order to be able
to reinstall the driver.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c |  4 ++-
 drivers/misc/habanalabs/common/habanalabs.h  |  2 +-
 drivers/misc/habanalabs/common/pci.c         |  5 +++-
 drivers/misc/habanalabs/gaudi/gaudi.c        | 27 ++++++++++++--------
 drivers/misc/habanalabs/goya/goya.c          | 22 ++++++++++------
 5 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index fb9d901d5059..2fc12e529241 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -607,7 +607,9 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
 		return -EIO;
 	}
 
-	hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
+	rc = hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
+	if (rc)
+		return rc;
 
 	security_status = RREG32(cpu_security_boot_status_reg);
 
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 9f5a3c07f421..227f1a9552a6 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -927,7 +927,7 @@ struct hl_asic_funcs {
 	void (*ctx_fini)(struct hl_ctx *ctx);
 	int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
 	u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx);
-	void (*read_device_fw_version)(struct hl_device *hdev,
+	int (*read_device_fw_version)(struct hl_device *hdev,
 					enum hl_fw_component fwc);
 	int (*load_firmware_to_device)(struct hl_device *hdev);
 	int (*load_boot_fit_to_device)(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/common/pci.c b/drivers/misc/habanalabs/common/pci.c
index 211f3190f8d7..02152d85cf19 100644
--- a/drivers/misc/habanalabs/common/pci.c
+++ b/drivers/misc/habanalabs/common/pci.c
@@ -390,8 +390,11 @@ int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
 	rc = hl_fw_read_preboot_status(hdev, cpu_boot_status_reg,
 			cpu_security_boot_status_reg, boot_err0_reg,
 			preboot_ver_timeout);
-	if (rc)
+	if (rc) {
+		dev_err(hdev->dev, "Failed to read preboot version\n");
+		hdev->asic_funcs->hw_fini(hdev, true);
 		goto unmap_pci_bars;
+	}
 
 	return 0;
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index ae73ffb4ad64..8abe46b8d615 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3615,7 +3615,7 @@ static int gaudi_load_boot_fit_to_device(struct hl_device *hdev)
 	return hl_fw_load_fw_to_device(hdev, GAUDI_BOOT_FIT_FILE, dst, 0, 0);
 }
 
-static void gaudi_read_device_fw_version(struct hl_device *hdev,
+static int gaudi_read_device_fw_version(struct hl_device *hdev,
 					enum hl_fw_component fwc)
 {
 	const char *name;
@@ -3635,7 +3635,7 @@ static void gaudi_read_device_fw_version(struct hl_device *hdev,
 		break;
 	default:
 		dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc);
-		return;
+		return -EIO;
 	}
 
 	ver_off &= ~((u32)SRAM_BASE_ADDR);
@@ -3647,7 +3647,10 @@ static void gaudi_read_device_fw_version(struct hl_device *hdev,
 		dev_err(hdev->dev, "%s version offset (0x%x) is above SRAM\n",
 								name, ver_off);
 		strcpy(dest, "unavailable");
+		return -EIO;
 	}
+
+	return 0;
 }
 
 static int gaudi_init_cpu(struct hl_device *hdev)
@@ -3937,16 +3940,18 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 
 	WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap);
 
-	gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
-					HW_CAP_HBM | HW_CAP_PCI_DMA |
-					HW_CAP_MME | HW_CAP_TPC_MASK |
-					HW_CAP_HBM_DMA | HW_CAP_PLL |
-					HW_CAP_NIC_MASK | HW_CAP_MMU |
-					HW_CAP_SRAM_SCRAMBLER |
-					HW_CAP_HBM_SCRAMBLER |
-					HW_CAP_CLK_GATE);
+	if (gaudi) {
+		gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
+				HW_CAP_HBM | HW_CAP_PCI_DMA |
+				HW_CAP_MME | HW_CAP_TPC_MASK |
+				HW_CAP_HBM_DMA | HW_CAP_PLL |
+				HW_CAP_NIC_MASK | HW_CAP_MMU |
+				HW_CAP_SRAM_SCRAMBLER |
+				HW_CAP_HBM_SCRAMBLER |
+				HW_CAP_CLK_GATE);
 
-	memset(gaudi->events_stat, 0, sizeof(gaudi->events_stat));
+		memset(gaudi->events_stat, 0, sizeof(gaudi->events_stat));
+	}
 }
 
 static int gaudi_suspend(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 1b81102d1bec..6dea52a50be4 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2341,7 +2341,7 @@ static int goya_load_boot_fit_to_device(struct hl_device *hdev)
  * FW component passes an offset from SRAM_BASE_ADDR in SCRATCHPAD_xx.
  * The version string should be located by that offset.
  */
-static void goya_read_device_fw_version(struct hl_device *hdev,
+static int goya_read_device_fw_version(struct hl_device *hdev,
 					enum hl_fw_component fwc)
 {
 	const char *name;
@@ -2361,7 +2361,7 @@ static void goya_read_device_fw_version(struct hl_device *hdev,
 		break;
 	default:
 		dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc);
-		return;
+		return -EIO;
 	}
 
 	ver_off &= ~((u32)SRAM_BASE_ADDR);
@@ -2373,7 +2373,11 @@ static void goya_read_device_fw_version(struct hl_device *hdev,
 		dev_err(hdev->dev, "%s version offset (0x%x) is above SRAM\n",
 								name, ver_off);
 		strcpy(dest, "unavailable");
+
+		return -EIO;
 	}
+
+	return 0;
 }
 
 static int goya_init_cpu(struct hl_device *hdev)
@@ -2644,12 +2648,14 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
 	WREG32(mmPSOC_GLOBAL_CONF_SW_BTM_FSM,
 			0xA << PSOC_GLOBAL_CONF_SW_BTM_FSM_CTRL_SHIFT);
 
-	goya->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
-					HW_CAP_DDR_0 | HW_CAP_DDR_1 |
-					HW_CAP_DMA | HW_CAP_MME |
-					HW_CAP_MMU | HW_CAP_TPC_MBIST |
-					HW_CAP_GOLDEN | HW_CAP_TPC);
-	memset(goya->events_stat, 0, sizeof(goya->events_stat));
+	if (goya) {
+		goya->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
+				HW_CAP_DDR_0 | HW_CAP_DDR_1 |
+				HW_CAP_DMA | HW_CAP_MME |
+				HW_CAP_MMU | HW_CAP_TPC_MBIST |
+				HW_CAP_GOLDEN | HW_CAP_TPC);
+		memset(goya->events_stat, 0, sizeof(goya->events_stat));
+	}
 }
 
 int goya_suspend(struct hl_device *hdev)
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: restore vm_pgoff after mmap
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
                   ` (6 preceding siblings ...)
  2020-11-04 14:09 ` [PATCH] habanalabs: reset device upon fw read failure Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: Separate CS job completion from its deallocation Oded Gabbay
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers

Due to using dma_mmap_coherent() to perform mmap of dma memory, we
had to clear the vm_pgoff field before calling that function.

However, that broke the userspace (profiler tool) as they relied
on searching the /proc/self/maps for these values to correctly
"disassemble" the topology recipe.

To re-enable that functionality, the driver can simply restore the
value of vm_pgoff before returning to userspace but after calling
dma_mmap_coherent().

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_buffer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index 360dc340b2f4..b30249e4d3b7 100644
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -518,6 +518,7 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 	}
 
 	cb->mmap_size = cb->size;
+	vma->vm_pgoff = handle;
 
 	return 0;
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: Separate CS job completion from its deallocation
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
                   ` (7 preceding siblings ...)
  2020-11-04 14:09 ` [PATCH] habanalabs: restore vm_pgoff after mmap Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: Skip updating CI of internal queues if not in use Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: Small refactoring of cs_do_release() Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Current CS jobs are no longer needed after their completion.
However, jobs of future workload might be in use even after they are
completed. To allow that, the patch adds a refcount to the job object,
and decouples its completion handling from its deallocation.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 25 ++++++++++++++-----
 drivers/misc/habanalabs/common/debugfs.c      | 13 +++++-----
 drivers/misc/habanalabs/common/habanalabs.h   |  2 ++
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 6de4b161856c..662b19663839 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -155,6 +155,18 @@ static void cs_put(struct hl_cs *cs)
 	kref_put(&cs->refcount, cs_do_release);
 }
 
+static void cs_job_do_release(struct kref *ref)
+{
+	struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);
+
+	kfree(job);
+}
+
+static void cs_job_put(struct hl_cs_job *job)
+{
+	kref_put(&job->refcount, cs_job_do_release);
+}
+
 static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
 {
 	/*
@@ -227,7 +239,7 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
 	return rc;
 }
 
-static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
+static void complete_job(struct hl_device *hdev, struct hl_cs_job *job)
 {
 	struct hl_cs *cs = job->cs;
 
@@ -276,7 +288,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 			job->queue_type == QUEUE_TYPE_HW)
 		cs_put(cs);
 
-	kfree(job);
+	cs_job_put(job);
 }
 
 static void cs_do_release(struct kref *ref)
@@ -290,13 +302,13 @@ static void cs_do_release(struct kref *ref)
 	/*
 	 * Although if we reached here it means that all external jobs have
 	 * finished, because each one of them took refcnt to CS, we still
-	 * need to go over the internal jobs and free them. Otherwise, we
+	 * need to go over the internal jobs and complete them. Otherwise, we
 	 * will have leaked memory and what's worse, the CS object (and
 	 * potentially the CTX object) could be released, while the JOB
 	 * still holds a pointer to them (but no reference).
 	 */
 	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-		free_job(hdev, job);
+		complete_job(hdev, job);
 
 	if (!cs->submitted) {
 		/* In case the wait for signal CS was submitted, the put occurs
@@ -507,7 +519,7 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
 	struct hl_cs_job *job, *tmp;
 
 	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-		free_job(hdev, job);
+		complete_job(hdev, job);
 }
 
 void hl_cs_rollback_all(struct hl_device *hdev)
@@ -539,7 +551,7 @@ static void job_wq_completion(struct work_struct *work)
 	struct hl_device *hdev = cs->ctx->hdev;
 
 	/* job is no longer needed */
-	free_job(hdev, job);
+	complete_job(hdev, job);
 }
 
 static int validate_queue_index(struct hl_device *hdev,
@@ -647,6 +659,7 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 	if (!job)
 		return NULL;
 
+	kref_init(&job->refcount);
 	job->queue_type = queue_type;
 	job->is_kernel_allocated_cb = is_kernel_allocated_cb;
 
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 912ddfa360b1..b44193ec3d12 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -168,18 +168,19 @@ static int command_submission_jobs_show(struct seq_file *s, void *data)
 		if (first) {
 			first = false;
 			seq_puts(s, "\n");
-			seq_puts(s, " JOB ID   CS ID    CTX ASID   H/W Queue\n");
-			seq_puts(s, "---------------------------------------\n");
+			seq_puts(s, " JOB ID   CS ID    CTX ASID   JOB RefCnt   H/W Queue\n");
+			seq_puts(s, "----------------------------------------------------\n");
 		}
 		if (job->cs)
 			seq_printf(s,
-				"    %02d       %llu         %d         %d\n",
+				"   %02d      %llu        %d          %d           %d\n",
 				job->id, job->cs->sequence, job->cs->ctx->asid,
-				job->hw_queue_id);
+				kref_read(&job->refcount), job->hw_queue_id);
 		else
 			seq_printf(s,
-				"    %02d       0         %d         %d\n",
-				job->id, HL_KERNEL_ASID_ID, job->hw_queue_id);
+				"   %02d      0        %d          %d           %d\n",
+				job->id, HL_KERNEL_ASID_ID,
+				kref_read(&job->refcount), job->hw_queue_id);
 	}
 
 	spin_unlock(&dev_entry->cs_job_spinlock);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 227f1a9552a6..2e3021adc824 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1142,6 +1142,7 @@ struct hl_cs {
  * @userptr_list: linked-list of userptr mappings that belong to this job and
  *			wait for completion.
  * @debugfs_list: node in debugfs list of command submission jobs.
+ * @refcount: reference counter for usage of the CS job.
  * @queue_type: the type of the H/W queue this job is submitted to.
  * @id: the id of this job inside a CS.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
@@ -1165,6 +1166,7 @@ struct hl_cs_job {
 	struct work_struct	finish_work;
 	struct list_head	userptr_list;
 	struct list_head	debugfs_list;
+	struct kref		refcount;
 	enum hl_queue_type	queue_type;
 	u32			id;
 	u32			hw_queue_id;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: Skip updating CI of internal queues if not in use
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
                   ` (8 preceding siblings ...)
  2020-11-04 14:09 ` [PATCH] habanalabs: Separate CS job completion from its deallocation Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  2020-11-04 14:09 ` [PATCH] habanalabs: Small refactoring of cs_do_release() Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

There are no internal queues if H/W queues are being used.
In this case we can skip the redundant traversal over the queues array,
looking for internal queues.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/hw_queue.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 239e2ba0545f..84a7458f3363 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -48,6 +48,11 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
 		return;
 
 	q = &hdev->kernel_queues[0];
+
+	/* There are no internal queues if H/W queues are being used */
+	if (!hdev->asic_prop.max_queues || q->queue_type == QUEUE_TYPE_HW)
+		return;
+
 	for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) {
 		if (q->queue_type == QUEUE_TYPE_INT)
 			atomic_add(cs->jobs_in_queue_cnt[i], &q->ci);
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] habanalabs: Small refactoring of cs_do_release()
  2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
                   ` (9 preceding siblings ...)
  2020-11-04 14:09 ` [PATCH] habanalabs: Skip updating CI of internal queues if not in use Oded Gabbay
@ 2020-11-04 14:09 ` Oded Gabbay
  10 siblings, 0 replies; 12+ messages in thread
From: Oded Gabbay @ 2020-11-04 14:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: SW_Drivers, Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Slightly refactor the cs_do_release() function, to reduce nesting level
and to ease the handling of future CS types.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 99 +++++++++----------
 1 file changed, 49 insertions(+), 50 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 3e6f4e5ef7ec..18cd20eec4c5 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -281,8 +281,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 
 static void cs_do_release(struct kref *ref)
 {
-	struct hl_cs *cs = container_of(ref, struct hl_cs,
-						refcount);
+	struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
 	struct hl_device *hdev = cs->ctx->hdev;
 	struct hl_cs_job *job, *tmp;
 
@@ -299,69 +298,69 @@ static void cs_do_release(struct kref *ref)
 	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
 		free_job(hdev, job);
 
-	/* We also need to update CI for internal queues */
-	if (cs->submitted) {
-		hdev->asic_funcs->hw_queues_lock(hdev);
+	if (!cs->submitted) {
+		/* In case the wait for signal CS was submitted, the put occurs
+		 * in init_signal_wait_cs() right before hanging on the PQ.
+		 */
+		if (cs->type == CS_TYPE_WAIT)
+			hl_fence_put(cs->signal_fence);
 
-		hdev->cs_active_cnt--;
-		if (!hdev->cs_active_cnt) {
-			struct hl_device_idle_busy_ts *ts;
+		goto out;
+	}
 
-			ts = &hdev->idle_busy_ts_arr[hdev->idle_busy_ts_idx++];
-			ts->busy_to_idle_ts = ktime_get();
+	hdev->asic_funcs->hw_queues_lock(hdev);
 
-			if (hdev->idle_busy_ts_idx == HL_IDLE_BUSY_TS_ARR_SIZE)
-				hdev->idle_busy_ts_idx = 0;
-		} else if (hdev->cs_active_cnt < 0) {
-			dev_crit(hdev->dev, "CS active cnt %d is negative\n",
-				hdev->cs_active_cnt);
-		}
+	hdev->cs_active_cnt--;
+	if (!hdev->cs_active_cnt) {
+		struct hl_device_idle_busy_ts *ts;
 
-		hdev->asic_funcs->hw_queues_unlock(hdev);
+		ts = &hdev->idle_busy_ts_arr[hdev->idle_busy_ts_idx++];
+		ts->busy_to_idle_ts = ktime_get();
 
-		hl_int_hw_queue_update_ci(cs);
+		if (hdev->idle_busy_ts_idx == HL_IDLE_BUSY_TS_ARR_SIZE)
+			hdev->idle_busy_ts_idx = 0;
+	} else if (hdev->cs_active_cnt < 0) {
+		dev_crit(hdev->dev, "CS active cnt %d is negative\n",
+			hdev->cs_active_cnt);
+	}
 
-		spin_lock(&hdev->hw_queues_mirror_lock);
-		/* remove CS from hw_queues mirror list */
-		list_del_init(&cs->mirror_node);
-		spin_unlock(&hdev->hw_queues_mirror_lock);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
 
-		/*
-		 * Don't cancel TDR in case this CS was timedout because we
-		 * might be running from the TDR context
-		 */
-		if ((!cs->timedout) &&
-			(hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) {
-			struct hl_cs *next;
+	/* Need to update CI for internal queues */
+	hl_int_hw_queue_update_ci(cs);
 
-			if (cs->tdr_active)
-				cancel_delayed_work_sync(&cs->work_tdr);
+	spin_lock(&hdev->hw_queues_mirror_lock);
+	/* remove CS from hw_queues mirror list */
+	list_del_init(&cs->mirror_node);
+	spin_unlock(&hdev->hw_queues_mirror_lock);
 
-			spin_lock(&hdev->hw_queues_mirror_lock);
+	/* Don't cancel TDR in case this CS was timedout because we might be
+	 * running from the TDR context
+	 */
+	if (!cs->timedout &&
+			hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) {
+		struct hl_cs *next;
 
-			/* queue TDR for next CS */
-			next = list_first_entry_or_null(
-					&hdev->hw_queues_mirror_list,
-					struct hl_cs, mirror_node);
+		if (cs->tdr_active)
+			cancel_delayed_work_sync(&cs->work_tdr);
 
-			if ((next) && (!next->tdr_active)) {
-				next->tdr_active = true;
-				schedule_delayed_work(&next->work_tdr,
-							hdev->timeout_jiffies);
-			}
+		spin_lock(&hdev->hw_queues_mirror_lock);
+
+		/* queue TDR for next CS */
+		next = list_first_entry_or_null(&hdev->hw_queues_mirror_list,
+						struct hl_cs, mirror_node);
 
-			spin_unlock(&hdev->hw_queues_mirror_lock);
+		if (next && !next->tdr_active) {
+			next->tdr_active = true;
+			schedule_delayed_work(&next->work_tdr,
+						hdev->timeout_jiffies);
 		}
-	} else if (cs->type == CS_TYPE_WAIT) {
-		/*
-		 * In case the wait for signal CS was submitted, the put occurs
-		 * in init_signal_wait_cs() right before hanging on the PQ.
-		 */
-		hl_fence_put(cs->signal_fence);
+
+		spin_unlock(&hdev->hw_queues_mirror_lock);
 	}
 
-	/*
-	 * Must be called before hl_ctx_put because inside we use ctx to get
+out:
+	/* Must be called before hl_ctx_put because inside we use ctx to get
 	 * the device
 	 */
 	hl_debugfs_remove_cs(cs);
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2020-11-04 14:10 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-04 14:08 [PATCH] habanalabs: add 'needs reset' state in driver Oded Gabbay
2020-11-04 14:08 ` [PATCH] habanalabs: fix hard reset print and comment Oded Gabbay
2020-11-04 14:08 ` [PATCH] habanalabs/gaudi: increase MAX CS to 16K Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs/gaudi: remove pcie_en strap toggle Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs: Move repeatedly included headers to habanalabs.h Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs: release signal if collective wait was dropped Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs: remove duplicate print Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs: reset device upon fw read failure Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs: restore vm_pgoff after mmap Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs: Separate CS job completion from its deallocation Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs: Skip updating CI of internal queues if not in use Oded Gabbay
2020-11-04 14:09 ` [PATCH] habanalabs: Small refactoring of cs_do_release() Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).