All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 0/3] crypto: hisilicon - supports device isolation feature
@ 2022-06-23  6:14 Kai Ye
  2022-06-23  6:14 ` [PATCH v4 1/3] uacce: " Kai Ye
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Kai Ye @ 2022-06-23  6:14 UTC (permalink / raw)
  To: gregkh, herbert
  Cc: linux-crypto, linux-accelerators, linux-kernel, linuxarm,
	zhangfei.gao, wangzhou1, yekai13

1. Add the uacce hardware error isolation interface.
2. Add the related implementation in the ACC driver to support the uacce
   interface, e.g. defining the isolation strategy for ACC through the
   uacce sysfs node: if the AER error frequency exceeds the configured
   value within a certain period of time, the device will not be
   available in user space. The VF device uses the PF device's isolation
   strategy, and the isolation strategy should not be set while the
   device is in use.
   
changes v1->v2:
	1、deleted dev_to_uacce api.
	2、add vfs node doc. 
	3、move uacce->ref to driver.
changes v2->v3:
	1、deleted some redundant code.
	2、use qm state instead of reference count.
	3、add null pointer check.
	4、isolate_strategy_read() instead of a copy.
changes v3->v4:
	modify a comment

Kai Ye (3):
  uacce: supports device isolation feature
  Documentation: add a isolation strategy sysfs node for uacce
  crypto: hisilicon/qm - defining the device isolation strategy

 Documentation/ABI/testing/sysfs-driver-uacce |  18 ++
 drivers/crypto/hisilicon/qm.c                | 177 +++++++++++++++++--
 drivers/misc/uacce/uacce.c                   |  41 +++++
 include/linux/hisi_acc_qm.h                  |   9 +
 include/linux/uacce.h                        |  11 ++
 5 files changed, 244 insertions(+), 12 deletions(-)

-- 
2.33.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v4 1/3] uacce: supports device isolation feature
  2022-06-23  6:14 [PATCH v4 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
@ 2022-06-23  6:14 ` Kai Ye
  2022-06-23  6:14 ` [PATCH v4 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
  2022-06-23  6:14 ` [PATCH v4 3/3] crypto: hisilicon/qm - defining the device isolation strategy Kai Ye
  2 siblings, 0 replies; 9+ messages in thread
From: Kai Ye @ 2022-06-23  6:14 UTC (permalink / raw)
  To: gregkh, herbert
  Cc: linux-crypto, linux-accelerators, linux-kernel, linuxarm,
	zhangfei.gao, wangzhou1, yekai13

UACCE adds a hardware error isolation API. Users can configure the
isolation method command through a sysfs node. The API supports
configuring a user protocol strategy, which is then parsed inside the
device driver. UACCE itself only reports the device isolation state,
e.g. when the error frequency threshold is exceeded, the device will
be isolated. The isolation strategy should be defined in each driver
module.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 drivers/misc/uacce/uacce.c | 41 ++++++++++++++++++++++++++++++++++++++
 include/linux/uacce.h      | 11 ++++++++++
 2 files changed, 52 insertions(+)

diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
index b6219c6bfb48..440144fea656 100644
--- a/drivers/misc/uacce/uacce.c
+++ b/drivers/misc/uacce/uacce.c
@@ -346,12 +346,51 @@ static ssize_t region_dus_size_show(struct device *dev,
 		       uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT);
 }
 
+static ssize_t isolate_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	if (!uacce->ops->get_isolate_state)
+		return -ENODEV;
+
+	return sysfs_emit(buf, "%d\n", uacce->ops->get_isolate_state(uacce));
+}
+
+static ssize_t isolate_strategy_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	if (!uacce->ops->isolate_strategy_read)
+		return -ENODEV;
+
+	return uacce->ops->isolate_strategy_read(uacce, buf);
+}
+
+static ssize_t isolate_strategy_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+	int ret;
+
+	if (!uacce->ops->isolate_strategy_write)
+		return -ENODEV;
+
+	ret = uacce->ops->isolate_strategy_write(uacce, buf, count);
+
+	return ret ? ret : count;
+}
+
 static DEVICE_ATTR_RO(api);
 static DEVICE_ATTR_RO(flags);
 static DEVICE_ATTR_RO(available_instances);
 static DEVICE_ATTR_RO(algorithms);
 static DEVICE_ATTR_RO(region_mmio_size);
 static DEVICE_ATTR_RO(region_dus_size);
+static DEVICE_ATTR_RO(isolate);
+static DEVICE_ATTR_RW(isolate_strategy);
 
 static struct attribute *uacce_dev_attrs[] = {
 	&dev_attr_api.attr,
@@ -360,6 +399,8 @@ static struct attribute *uacce_dev_attrs[] = {
 	&dev_attr_algorithms.attr,
 	&dev_attr_region_mmio_size.attr,
 	&dev_attr_region_dus_size.attr,
+	&dev_attr_isolate.attr,
+	&dev_attr_isolate_strategy.attr,
 	NULL,
 };
 
diff --git a/include/linux/uacce.h b/include/linux/uacce.h
index 48e319f40275..a535286d2753 100644
--- a/include/linux/uacce.h
+++ b/include/linux/uacce.h
@@ -30,6 +30,9 @@ struct uacce_qfile_region {
  * @is_q_updated: check whether the task is finished
  * @mmap: mmap addresses of queue to user space
  * @ioctl: ioctl for user space users of the queue
+ * @get_isolate_state: get the device state after the isolate strategy is set
+ * @isolate_strategy_write: store the isolate strategy to the device
+ * @isolate_strategy_read: read the isolate strategy from the device
  */
 struct uacce_ops {
 	int (*get_available_instances)(struct uacce_device *uacce);
@@ -43,6 +46,9 @@ struct uacce_ops {
 		    struct uacce_qfile_region *qfr);
 	long (*ioctl)(struct uacce_queue *q, unsigned int cmd,
 		      unsigned long arg);
+	enum uacce_dev_state (*get_isolate_state)(struct uacce_device *uacce);
+	int (*isolate_strategy_write)(struct uacce_device *uacce, const char *buf, size_t count);
+	int (*isolate_strategy_read)(struct uacce_device *uacce, char *buf);
 };
 
 /**
@@ -57,6 +63,11 @@ struct uacce_interface {
 	const struct uacce_ops *ops;
 };
 
+enum uacce_dev_state {
+	UACCE_DEV_NORMAL,
+	UACCE_DEV_ISOLATE,
+};
+
 enum uacce_q_state {
 	UACCE_Q_ZOMBIE = 0,
 	UACCE_Q_INIT,
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v4 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-06-23  6:14 [PATCH v4 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
  2022-06-23  6:14 ` [PATCH v4 1/3] uacce: " Kai Ye
@ 2022-06-23  6:14 ` Kai Ye
  2022-06-23  9:01   ` Greg KH
  2022-06-23  6:14 ` [PATCH v4 3/3] crypto: hisilicon/qm - defining the device isolation strategy Kai Ye
  2 siblings, 1 reply; 9+ messages in thread
From: Kai Ye @ 2022-06-23  6:14 UTC (permalink / raw)
  To: gregkh, herbert
  Cc: linux-crypto, linux-accelerators, linux-kernel, linuxarm,
	zhangfei.gao, wangzhou1, yekai13

Update the documentation to describe the sysfs nodes that help users
configure the isolation method command from user space.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
index 08f2591138af..8784efa96e01 100644
--- a/Documentation/ABI/testing/sysfs-driver-uacce
+++ b/Documentation/ABI/testing/sysfs-driver-uacce
@@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
 Description:    Available instances left of the device
                 Return -ENODEV if uacce_ops get_available_instances is not provided
 
+What:           /sys/class/uacce/<dev_name>/isolate_strategy
+Date:           Jun 2022
+KernelVersion:  5.20
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    A sysfs node that is used to configure the hardware error
+                isolation method command. The command is parsed by the
+                device driver. e.g. If the device slot reset frequency
+                exceeds the preset value in a time window, the device will be
+                isolated.
+
+What:           /sys/class/uacce/<dev_name>/isolate
+Date:           Jun 2022
+KernelVersion:  5.20
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    A read-only sysfs node that shows the device isolation state.
+                The value 0 means that the device is working. The value 1
+                means that the device has been isolated.
+
 What:           /sys/class/uacce/<dev_name>/algorithms
 Date:           Feb 2020
 KernelVersion:  5.7
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v4 3/3] crypto: hisilicon/qm - defining the device isolation strategy
  2022-06-23  6:14 [PATCH v4 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
  2022-06-23  6:14 ` [PATCH v4 1/3] uacce: " Kai Ye
  2022-06-23  6:14 ` [PATCH v4 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
@ 2022-06-23  6:14 ` Kai Ye
  2022-06-23  9:05   ` Greg KH
  2 siblings, 1 reply; 9+ messages in thread
From: Kai Ye @ 2022-06-23  6:14 UTC (permalink / raw)
  To: gregkh, herbert
  Cc: linux-crypto, linux-accelerators, linux-kernel, linuxarm,
	zhangfei.gao, wangzhou1, yekai13

Define the device isolation strategy in the device driver. The
user configures a frequency value through the uacce interface. If the
slot reset frequency exceeds the configured value within a certain
period of time, the device will not be available in user space.
This frequency is an abstract number of slot resets that can be
considered to occur in a time window. The time window can be set
to one hour or one day. The VF device uses the PF device's isolation
strategy. All hardware errors are processed by the PF driver.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 177 +++++++++++++++++++++++++++++++---
 include/linux/hisi_acc_qm.h   |   9 ++
 2 files changed, 174 insertions(+), 12 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index ad83c194d664..f92cf20fc84e 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -417,6 +417,16 @@ struct hisi_qm_resource {
 	struct list_head list;
 };
 
+/**
+ * struct qm_hw_err - describes a device hardware error
+ * @list: hardware error list
+ * @timestamp: timestamp when the error occurred
+ */
+struct qm_hw_err {
+	struct list_head list;
+	unsigned long long timestamp;
+};
+
 struct hisi_qm_hw_ops {
 	int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number);
 	void (*qm_db)(struct hisi_qm *qm, u16 qn,
@@ -3410,6 +3420,125 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd,
 	return 0;
 }
 
+/**
+ * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs
+ * according to the user's configured isolation strategy. Warning: this
+ * API should be called while the users of this device are suspended
+ * by the slot reset preparation of PCI AER.
+ * @qm: the uacce device
+ */
+static int qm_hw_err_isolate(struct hisi_qm *qm)
+{
+	struct qm_hw_err *err, *tmp, *hw_err;
+	struct qm_err_isolate *isolate;
+	u32 count = 0;
+
+	isolate = &qm->isolate_data;
+
+#define SECONDS_PER_HOUR	3600
+
+	/* All the hw errs are processed by PF driver */
+	if (qm->uacce->is_vf || atomic_read(&isolate->is_isolate) ||
+	    !isolate->hw_err_isolate_hz)
+		return 0;
+
+	hw_err = kzalloc(sizeof(*hw_err), GFP_ATOMIC);
+	if (!hw_err)
+		return -ENOMEM;
+
+	mutex_lock(&isolate->isolate_lock);
+	hw_err->timestamp = jiffies;
+	list_for_each_entry_safe(err, tmp, &qm->uacce_hw_errs, list) {
+		if ((hw_err->timestamp - err->timestamp) / HZ >
+		    SECONDS_PER_HOUR) {
+			list_del(&err->list);
+			kfree(err);
+		} else {
+			count++;
+		}
+	}
+	list_add(&hw_err->list, &qm->uacce_hw_errs);
+	mutex_unlock(&isolate->isolate_lock);
+
+	if (count >= isolate->hw_err_isolate_hz)
+		atomic_set(&isolate->is_isolate, 1);
+
+	return 0;
+}
+
+static void qm_hw_err_destroy(struct hisi_qm *qm)
+{
+	struct qm_hw_err *err, *tmp;
+
+	mutex_lock(&qm->isolate_data.isolate_lock);
+	list_for_each_entry_safe(err, tmp, &qm->uacce_hw_errs, list) {
+		list_del(&err->list);
+		kfree(err);
+	}
+	mutex_unlock(&qm->isolate_data.isolate_lock);
+}
+
+static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce)
+{
+	struct hisi_qm *qm = uacce->priv;
+	struct hisi_qm *pf_qm;
+
+	if (uacce->is_vf)
+		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
+	else
+		pf_qm = qm;
+
+	return atomic_read(&pf_qm->isolate_data.is_isolate) ?
+			UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL;
+}
+
+static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce,
+					  const char *buf, size_t len)
+{
+	struct hisi_qm *qm = uacce->priv;
+	unsigned long val;
+
+#define MAX_ISOLATE_STRATEGY	65535
+
+	/* Must be set by PF */
+	if (uacce->is_vf) {
+		dev_info(&qm->pdev->dev, "the isolation strategy must be set by PF.\n");
+		return -EINVAL;
+	}
+
+	if (atomic_read(&qm->isolate_data.is_isolate))
+		return -EINVAL;
+
+	if (kstrtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val > MAX_ISOLATE_STRATEGY)
+		return -EINVAL;
+
+	qm->isolate_data.hw_err_isolate_hz = val;
+
+	/* After the policy is updated, need to reset the hardware err list */
+	qm_hw_err_destroy(qm);
+
+	return 0;
+}
+
+static int hisi_qm_isolate_strategy_read(struct uacce_device *uacce, char *buf)
+{
+	struct hisi_qm *qm = uacce->priv;
+	struct hisi_qm *pf_qm;
+	unsigned long val;
+
+	if (uacce->is_vf) {
+		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
+		val = pf_qm->isolate_data.hw_err_isolate_hz;
+	} else {
+		val = qm->isolate_data.hw_err_isolate_hz;
+	}
+
+	return sysfs_emit(buf, "%lu\n", val);
+}
+
 static const struct uacce_ops uacce_qm_ops = {
 	.get_available_instances = hisi_qm_get_available_instances,
 	.get_queue = hisi_qm_uacce_get_queue,
@@ -3419,8 +3548,22 @@ static const struct uacce_ops uacce_qm_ops = {
 	.mmap = hisi_qm_uacce_mmap,
 	.ioctl = hisi_qm_uacce_ioctl,
 	.is_q_updated = hisi_qm_is_q_updated,
+	.get_isolate_state = hisi_qm_get_isolate_state,
+	.isolate_strategy_write = hisi_qm_isolate_strategy_write,
+	.isolate_strategy_read = hisi_qm_isolate_strategy_read,
 };
 
+static void qm_remove_uacce(struct hisi_qm *qm)
+{
+	struct uacce_device *uacce = qm->uacce;
+
+	if (qm->use_sva) {
+		qm_hw_err_destroy(qm);
+		uacce_remove(uacce);
+		qm->uacce = NULL;
+	}
+}
+
 static int qm_alloc_uacce(struct hisi_qm *qm)
 {
 	struct pci_dev *pdev = qm->pdev;
@@ -3433,6 +3576,8 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
 	};
 	int ret;
 
+	INIT_LIST_HEAD(&qm->uacce_hw_errs);
+	mutex_init(&qm->isolate_data.isolate_lock);
 	ret = strscpy(interface.name, dev_driver_string(&pdev->dev),
 		      sizeof(interface.name));
 	if (ret < 0)
@@ -3446,8 +3591,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
 		qm->use_sva = true;
 	} else {
 		/* only consider sva case */
-		uacce_remove(uacce);
-		qm->uacce = NULL;
+		qm_remove_uacce(qm);
 		return -EINVAL;
 	}
 
@@ -5109,6 +5253,12 @@ static int qm_controller_reset_prepare(struct hisi_qm *qm)
 		return ret;
 	}
 
+	if (qm->use_sva) {
+		ret = qm_hw_err_isolate(qm);
+		if (ret)
+			pci_err(pdev, "failed to isolate hw err!\n");
+	}
+
 	ret = qm_wait_vf_prepare_finish(qm);
 	if (ret)
 		pci_err(pdev, "failed to stop by vfs in soft reset!\n");
@@ -5436,19 +5586,25 @@ static int qm_controller_reset(struct hisi_qm *qm)
 	ret = qm_soft_reset(qm);
 	if (ret) {
 		pci_err(pdev, "Controller reset failed (%d)\n", ret);
-		qm_reset_bit_clear(qm);
-		return ret;
+		goto err_reset;
 	}
 
 	ret = qm_controller_reset_done(qm);
-	if (ret) {
-		qm_reset_bit_clear(qm);
-		return ret;
-	}
+	if (ret)
+		goto err_reset;
 
 	pci_info(pdev, "Controller reset complete\n");
 
 	return 0;
+
+err_reset:
+	pci_err(pdev, "Controller reset failed (%d)\n", ret);
+	qm_reset_bit_clear(qm);
+
+	/* if resetting fails, isolate the device */
+	if (qm->use_sva && !qm->uacce->is_vf)
+		atomic_set(&qm->isolate_data.is_isolate, 1);
+	return ret;
 }
 
 /**
@@ -6246,10 +6402,7 @@ int hisi_qm_init(struct hisi_qm *qm)
 err_free_qm_memory:
 	hisi_qm_memory_uninit(qm);
 err_alloc_uacce:
-	if (qm->use_sva) {
-		uacce_remove(qm->uacce);
-		qm->uacce = NULL;
-	}
+	qm_remove_uacce(qm);
 err_irq_register:
 	qm_irq_unregister(qm);
 err_pci_init:
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 116e8bd68c99..44454150c205 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -271,6 +271,13 @@ struct hisi_qm_poll_data {
 	u16 *qp_finish_id;
 };
 
+struct qm_err_isolate {
+	struct mutex isolate_lock;
+	/* user cfg freq which triggers isolation */
+	u32 hw_err_isolate_hz;
+	atomic_t is_isolate;
+};
+
 struct hisi_qm {
 	enum qm_hw_ver ver;
 	enum qm_fun_type fun_type;
@@ -335,6 +342,8 @@ struct hisi_qm {
 	struct qm_shaper_factor *factor;
 	u32 mb_qos;
 	u32 type_rate;
+	struct list_head uacce_hw_errs;
+	struct qm_err_isolate isolate_data;
 };
 
 struct hisi_qp_status {
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-06-23  6:14 ` [PATCH v4 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
@ 2022-06-23  9:01   ` Greg KH
  2022-06-24  3:25     ` yekai(A)
  0 siblings, 1 reply; 9+ messages in thread
From: Greg KH @ 2022-06-23  9:01 UTC (permalink / raw)
  To: Kai Ye
  Cc: herbert, linux-crypto, linux-accelerators, linux-kernel,
	linuxarm, zhangfei.gao, wangzhou1

On Thu, Jun 23, 2022 at 02:14:51PM +0800, Kai Ye wrote:
> Update documentation describing sysfs node that could help to
> configure isolation method command for users in th user space.
> 
> Signed-off-by: Kai Ye <yekai13@huawei.com>
> ---
>  Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
> index 08f2591138af..8784efa96e01 100644
> --- a/Documentation/ABI/testing/sysfs-driver-uacce
> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
> @@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
>  Description:    Available instances left of the device
>                  Return -ENODEV if uacce_ops get_available_instances is not provided
>  
> +What:           /sys/class/uacce/<dev_name>/isolate_strategy
> +Date:           Jun 2022
> +KernelVersion:  5.20
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    A sysfs node that used to configures the hardware error
> +                isolation method command. The command can be parsed
> +                in correct driver. e.g. If the device slot reset frequency
> +                exceeds the preset value in a time window, the device will be
> +                isolated.

What is the "command"?  What is being parsed?  This needs to be
documented a lot more here, this is very vague and not obvious at all.


> +
> +What:           /sys/class/uacce/<dev_name>/isolate
> +Date:           Jun 2022
> +KernelVersion:  5.20
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    A sysfs node that show the device isolated state. The value 0
> +                means that the device is working. The value 1 means that the
> +                device has been isolated.

Are these read-only?  Write only?  read/write?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 3/3] crypto: hisilicon/qm - defining the device isolation strategy
  2022-06-23  6:14 ` [PATCH v4 3/3] crypto: hisilicon/qm - defining the device isolation strategy Kai Ye
@ 2022-06-23  9:05   ` Greg KH
  2022-06-27 12:09     ` yekai(A)
  0 siblings, 1 reply; 9+ messages in thread
From: Greg KH @ 2022-06-23  9:05 UTC (permalink / raw)
  To: Kai Ye
  Cc: herbert, linux-crypto, linux-accelerators, linux-kernel,
	linuxarm, zhangfei.gao, wangzhou1

On Thu, Jun 23, 2022 at 02:14:52PM +0800, Kai Ye wrote:
> Define the device isolation strategy by the device driver. The
> user configures a frequency value by uacce interface. If the
> slot reset frequency exceeds the value of setting for a certain
> period of time, the device will not be available in user space.
> This frequency is an abstract number of times that can be
> considered to occur in a time window. The time window can be set
> to one hour or one day. The VF device use the PF device isolation
> strategy. All the hardware errors are processed by PF driver.
> 
> Signed-off-by: Kai Ye <yekai13@huawei.com>
> ---
>  drivers/crypto/hisilicon/qm.c | 177 +++++++++++++++++++++++++++++++---
>  include/linux/hisi_acc_qm.h   |   9 ++
>  2 files changed, 174 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
> index ad83c194d664..f92cf20fc84e 100644
> --- a/drivers/crypto/hisilicon/qm.c
> +++ b/drivers/crypto/hisilicon/qm.c
> @@ -417,6 +417,16 @@ struct hisi_qm_resource {
>  	struct list_head list;
>  };
>  
> +/**
> + * struct qm_hw_err - structure of describes the device err
> + * @list: hardware error list
> + * @timestamp: timestamp when the error occurred
> + */
> +struct qm_hw_err {
> +	struct list_head list;
> +	unsigned long long timestamp;
> +};
> +
>  struct hisi_qm_hw_ops {
>  	int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number);
>  	void (*qm_db)(struct hisi_qm *qm, u16 qn,
> @@ -3410,6 +3420,125 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd,
>  	return 0;
>  }
>  
> +/**
> + * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs
> + * according to user's configuration of isolation strategy. Warning: this
> + * API should be called while there the users on this device are suspended
> + * by slot resetting preparation of PCI AER.
> + * @qm: the uacce device
> + */
> +static int qm_hw_err_isolate(struct hisi_qm *qm)
> +{
> +	struct qm_hw_err *err, *tmp, *hw_err;
> +	struct qm_err_isolate *isolate;
> +	u32 count = 0;
> +
> +	isolate = &qm->isolate_data;
> +
> +#define SECONDS_PER_HOUR	3600
> +
> +	/* All the hw errs are processed by PF driver */
> +	if (qm->uacce->is_vf || atomic_read(&isolate->is_isolate) ||
> +	    !isolate->hw_err_isolate_hz)
> +		return 0;
> +
> +	hw_err = kzalloc(sizeof(*hw_err), GFP_ATOMIC);
> +	if (!hw_err)
> +		return -ENOMEM;
> +
> +	mutex_lock(&isolate->isolate_lock);
> +	hw_err->timestamp = jiffies;
> +	list_for_each_entry_safe(err, tmp, &qm->uacce_hw_errs, list) {
> +		if ((hw_err->timestamp - err->timestamp) / HZ >
> +		    SECONDS_PER_HOUR) {
> +			list_del(&err->list);
> +			kfree(err);
> +		} else {
> +			count++;
> +		}
> +	}
> +	list_add(&hw_err->list, &qm->uacce_hw_errs);
> +	mutex_unlock(&isolate->isolate_lock);
> +
> +	if (count >= isolate->hw_err_isolate_hz)
> +		atomic_set(&isolate->is_isolate, 1);

Why is this an atomic value?  You can change it after it has been set,
and it can be modified after it is read.  A normal 'bool' would work
exactly the same.



> +
> +	return 0;
> +}
> +
> +static void qm_hw_err_destroy(struct hisi_qm *qm)
> +{
> +	struct qm_hw_err *err, *tmp;
> +
> +	mutex_lock(&qm->isolate_data.isolate_lock);
> +	list_for_each_entry_safe(err, tmp, &qm->uacce_hw_errs, list) {
> +		list_del(&err->list);
> +		kfree(err);
> +	}
> +	mutex_unlock(&qm->isolate_data.isolate_lock);
> +}
> +
> +static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce)
> +{
> +	struct hisi_qm *qm = uacce->priv;
> +	struct hisi_qm *pf_qm;
> +
> +	if (uacce->is_vf)
> +		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
> +	else
> +		pf_qm = qm;
> +
> +	return atomic_read(&pf_qm->isolate_data.is_isolate) ?
> +			UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL;
> +}
> +
> +static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce,
> +					  const char *buf, size_t len)
> +{
> +	struct hisi_qm *qm = uacce->priv;
> +	unsigned long val;
> +
> +#define MAX_ISOLATE_STRATEGY	65535

What is this value?  Please document it better.

> +
> +	/* Must be set by PF */
> +	if (uacce->is_vf) {
> +		dev_info(&qm->pdev->dev, "the isolation strategy must be set by PF.\n");

Do not let userspace spam the kernel log for no good reason.

And why is this an info?

> +		return -EINVAL;

This looks like an error.

> +	}
> +
> +	if (atomic_read(&qm->isolate_data.is_isolate))
> +		return -EINVAL;

What happens if this changes right after reading it?

> +
> +	if (kstrtoul(buf, 0, &val) < 0)
> +		return -EINVAL;
> +
> +	if (val > MAX_ISOLATE_STRATEGY)
> +		return -EINVAL;
> +
> +	qm->isolate_data.hw_err_isolate_hz = val;
> +
> +	/* After the policy is updated, need to reset the hardware err list */
> +	qm_hw_err_destroy(qm);
> +
> +	return 0;

Don't you need to return the number of bytes read, not 0?

> +}
> +
> +static int hisi_qm_isolate_strategy_read(struct uacce_device *uacce, char *buf)
> +{
> +	struct hisi_qm *qm = uacce->priv;
> +	struct hisi_qm *pf_qm;
> +	unsigned long val;
> +
> +	if (uacce->is_vf) {
> +		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
> +		val = pf_qm->isolate_data.hw_err_isolate_hz;
> +	} else {
> +		val = qm->isolate_data.hw_err_isolate_hz;
> +	}
> +
> +	return sysfs_emit(buf, "%lu\n", val);
> +}
> +
>  static const struct uacce_ops uacce_qm_ops = {
>  	.get_available_instances = hisi_qm_get_available_instances,
>  	.get_queue = hisi_qm_uacce_get_queue,
> @@ -3419,8 +3548,22 @@ static const struct uacce_ops uacce_qm_ops = {
>  	.mmap = hisi_qm_uacce_mmap,
>  	.ioctl = hisi_qm_uacce_ioctl,
>  	.is_q_updated = hisi_qm_is_q_updated,
> +	.get_isolate_state = hisi_qm_get_isolate_state,
> +	.isolate_strategy_write = hisi_qm_isolate_strategy_write,
> +	.isolate_strategy_read = hisi_qm_isolate_strategy_read,
>  };
>  
> +static void qm_remove_uacce(struct hisi_qm *qm)
> +{
> +	struct uacce_device *uacce = qm->uacce;
> +
> +	if (qm->use_sva) {
> +		qm_hw_err_destroy(qm);
> +		uacce_remove(uacce);
> +		qm->uacce = NULL;
> +	}
> +}
> +
>  static int qm_alloc_uacce(struct hisi_qm *qm)
>  {
>  	struct pci_dev *pdev = qm->pdev;
> @@ -3433,6 +3576,8 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
>  	};
>  	int ret;
>  
> +	INIT_LIST_HEAD(&qm->uacce_hw_errs);
> +	mutex_init(&qm->isolate_data.isolate_lock);
>  	ret = strscpy(interface.name, dev_driver_string(&pdev->dev),
>  		      sizeof(interface.name));
>  	if (ret < 0)
> @@ -3446,8 +3591,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
>  		qm->use_sva = true;
>  	} else {
>  		/* only consider sva case */
> -		uacce_remove(uacce);
> -		qm->uacce = NULL;
> +		qm_remove_uacce(qm);
>  		return -EINVAL;
>  	}
>  
> @@ -5109,6 +5253,12 @@ static int qm_controller_reset_prepare(struct hisi_qm *qm)
>  		return ret;
>  	}
>  
> +	if (qm->use_sva) {
> +		ret = qm_hw_err_isolate(qm);
> +		if (ret)
> +			pci_err(pdev, "failed to isolate hw err!\n");
> +	}
> +
>  	ret = qm_wait_vf_prepare_finish(qm);
>  	if (ret)
>  		pci_err(pdev, "failed to stop by vfs in soft reset!\n");
> @@ -5436,19 +5586,25 @@ static int qm_controller_reset(struct hisi_qm *qm)
>  	ret = qm_soft_reset(qm);
>  	if (ret) {
>  		pci_err(pdev, "Controller reset failed (%d)\n", ret);
> -		qm_reset_bit_clear(qm);
> -		return ret;
> +		goto err_reset;
>  	}
>  
>  	ret = qm_controller_reset_done(qm);
> -	if (ret) {
> -		qm_reset_bit_clear(qm);
> -		return ret;
> -	}
> +	if (ret)
> +		goto err_reset;
>  
>  	pci_info(pdev, "Controller reset complete\n");
>  
>  	return 0;
> +
> +err_reset:
> +	pci_err(pdev, "Controller reset failed (%d)\n", ret);
> +	qm_reset_bit_clear(qm);
> +
> +	/* if resetting fails, isolate the device */
> +	if (qm->use_sva && !qm->uacce->is_vf)
> +		atomic_set(&qm->isolate_data.is_isolate, 1);
> +	return ret;
>  }
>  
>  /**
> @@ -6246,10 +6402,7 @@ int hisi_qm_init(struct hisi_qm *qm)
>  err_free_qm_memory:
>  	hisi_qm_memory_uninit(qm);
>  err_alloc_uacce:
> -	if (qm->use_sva) {
> -		uacce_remove(qm->uacce);
> -		qm->uacce = NULL;
> -	}
> +	qm_remove_uacce(qm);
>  err_irq_register:
>  	qm_irq_unregister(qm);
>  err_pci_init:
> diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
> index 116e8bd68c99..44454150c205 100644
> --- a/include/linux/hisi_acc_qm.h
> +++ b/include/linux/hisi_acc_qm.h
> @@ -271,6 +271,13 @@ struct hisi_qm_poll_data {
>  	u16 *qp_finish_id;
>  };
>  
> +struct qm_err_isolate {
> +	struct mutex isolate_lock;
> +	/* user cfg freq which triggers isolation */
> +	u32 hw_err_isolate_hz;
> +	atomic_t is_isolate;

Again, why is this an atomic value?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-06-23  9:01   ` Greg KH
@ 2022-06-24  3:25     ` yekai(A)
  2022-06-24  6:44       ` Greg KH
  0 siblings, 1 reply; 9+ messages in thread
From: yekai(A) @ 2022-06-24  3:25 UTC (permalink / raw)
  To: Greg KH
  Cc: herbert, linux-crypto, linux-accelerators, linux-kernel,
	linuxarm, zhangfei.gao, wangzhou1



On 2022/6/23 17:01, Greg KH wrote:
> On Thu, Jun 23, 2022 at 02:14:51PM +0800, Kai Ye wrote:
>> Update documentation describing sysfs node that could help to
>> configure isolation method command for users in th user space.
>>
>> Signed-off-by: Kai Ye <yekai13@huawei.com>
>> ---
>>  Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
>> index 08f2591138af..8784efa96e01 100644
>> --- a/Documentation/ABI/testing/sysfs-driver-uacce
>> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
>> @@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
>>  Description:    Available instances left of the device
>>                  Return -ENODEV if uacce_ops get_available_instances is not provided
>>
>> +What:           /sys/class/uacce/<dev_name>/isolate_strategy
>> +Date:           Jun 2022
>> +KernelVersion:  5.20
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    A sysfs node that used to configures the hardware error
>> +                isolation method command. The command can be parsed
>> +                in correct driver. e.g. If the device slot reset frequency
>> +                exceeds the preset value in a time window, the device will be
>> +                isolated.
>
> What is the "command"?  What is being parsed?  This needs to be
> documented a lot more here, this is very vague and not obvious at all.
>
>

This command is a string command issued by the user. After the command 
is configured, the acc driver parses the command.
>> +
>> +What:           /sys/class/uacce/<dev_name>/isolate
>> +Date:           Jun 2022
>> +KernelVersion:  5.20
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    A sysfs node that show the device isolated state. The value 0
>> +                means that the device is working. The value 1 means that the
>> +                device has been isolated.
>
> Are these read-only?  Write only?  read/write?

this node is read-only
>
> thanks,
>
> greg k-h
> .
>

Thanks
Kai

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-06-24  3:25     ` yekai(A)
@ 2022-06-24  6:44       ` Greg KH
  0 siblings, 0 replies; 9+ messages in thread
From: Greg KH @ 2022-06-24  6:44 UTC (permalink / raw)
  To: yekai(A)
  Cc: herbert, linux-crypto, linux-accelerators, linux-kernel,
	linuxarm, zhangfei.gao, wangzhou1

On Fri, Jun 24, 2022 at 11:25:10AM +0800, yekai(A) wrote:
> 
> 
> On 2022/6/23 17:01, Greg KH wrote:
> > On Thu, Jun 23, 2022 at 02:14:51PM +0800, Kai Ye wrote:
> > > Update documentation describing sysfs node that could help to
> > > configure isolation method command for users in th user space.
> > > 
> > > Signed-off-by: Kai Ye <yekai13@huawei.com>
> > > ---
> > >  Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
> > >  1 file changed, 18 insertions(+)
> > > 
> > > diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
> > > index 08f2591138af..8784efa96e01 100644
> > > --- a/Documentation/ABI/testing/sysfs-driver-uacce
> > > +++ b/Documentation/ABI/testing/sysfs-driver-uacce
> > > @@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
> > >  Description:    Available instances left of the device
> > >                  Return -ENODEV if uacce_ops get_available_instances is not provided
> > > 
> > > +What:           /sys/class/uacce/<dev_name>/isolate_strategy
> > > +Date:           Jun 2022
> > > +KernelVersion:  5.20
> > > +Contact:        linux-accelerators@lists.ozlabs.org
> > > +Description:    A sysfs node that used to configures the hardware error
> > > +                isolation method command. The command can be parsed
> > > +                in correct driver. e.g. If the device slot reset frequency
> > > +                exceeds the preset value in a time window, the device will be
> > > +                isolated.
> > 
> > What is the "command"?  What is being parsed?  This needs to be
> > documented a lot more here, this is very vague and not obvious at all.
> > 
> > 
> 
> This command is a string command issued by the user. After the command is
> configured, the acc driver parses the command.

I am sorry, but I do not understand what you mean here.  What exactly is
a "command"?  What format is it in?  What are valid commands?  What are
invalid commands?  Are these commands different for different devices?
What do the commands do?  What are the return values for the commands?
And so on.

You are creating a new user/kernel API here and so you must define it
very specifically.  You have not specified anything for us to know how
this works at all and so we can not accept this for that reason alone
(nor should you want us to.)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 3/3] crypto: hisilicon/qm - defining the device isolation strategy
  2022-06-23  9:05   ` Greg KH
@ 2022-06-27 12:09     ` yekai(A)
  0 siblings, 0 replies; 9+ messages in thread
From: yekai(A) @ 2022-06-27 12:09 UTC (permalink / raw)
  To: Greg KH
  Cc: herbert, linux-crypto, linux-accelerators, linux-kernel,
	linuxarm, zhangfei.gao, wangzhou1



On 2022/6/23 17:05, Greg KH wrote:
> On Thu, Jun 23, 2022 at 02:14:52PM +0800, Kai Ye wrote:
>> Define the device isolation strategy by the device driver. The
>> user configures a frequency value by uacce interface. If the
>> slot reset frequency exceeds the value of setting for a certain
>> period of time, the device will not be available in user space.
>> This frequency is an abstract number of times that can be
>> considered to occur in a time window. The time window can be set
>> to one hour or one day. The VF device use the PF device isolation
>> strategy. All the hardware errors are processed by PF driver.
>>
>> Signed-off-by: Kai Ye <yekai13@huawei.com>
>> ---
>>  drivers/crypto/hisilicon/qm.c | 177 +++++++++++++++++++++++++++++++---
>>  include/linux/hisi_acc_qm.h   |   9 ++
>>  2 files changed, 174 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
>> index ad83c194d664..f92cf20fc84e 100644
>> --- a/drivers/crypto/hisilicon/qm.c
>> +++ b/drivers/crypto/hisilicon/qm.c
>> @@ -417,6 +417,16 @@ struct hisi_qm_resource {
>>  	struct list_head list;
>>  };
>>
>> +/**
>> + * struct qm_hw_err - structure of describes the device err
>> + * @list: hardware error list
>> + * @timestamp: timestamp when the error occurred
>> + */
>> +struct qm_hw_err {
>> +	struct list_head list;
>> +	unsigned long long timestamp;
>> +};
>> +
>>  struct hisi_qm_hw_ops {
>>  	int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number);
>>  	void (*qm_db)(struct hisi_qm *qm, u16 qn,
>> @@ -3410,6 +3420,125 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd,
>>  	return 0;
>>  }
>>
>> +/**
>> + * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs
>> + * according to user's configuration of isolation strategy. Warning: this
>> + * API should be called while there the users on this device are suspended
>> + * by slot resetting preparation of PCI AER.
>> + * @qm: the uacce device
>> + */
>> +static int qm_hw_err_isolate(struct hisi_qm *qm)
>> +{
>> +	struct qm_hw_err *err, *tmp, *hw_err;
>> +	struct qm_err_isolate *isolate;
>> +	u32 count = 0;
>> +
>> +	isolate = &qm->isolate_data;
>> +
>> +#define SECONDS_PER_HOUR	3600
>> +
>> +	/* All the hw errs are processed by PF driver */
>> +	if (qm->uacce->is_vf || atomic_read(&isolate->is_isolate) ||
>> +	    !isolate->hw_err_isolate_hz)
>> +		return 0;
>> +
>> +	hw_err = kzalloc(sizeof(*hw_err), GFP_ATOMIC);
>> +	if (!hw_err)
>> +		return -ENOMEM;
>> +
>> +	mutex_lock(&isolate->isolate_lock);
>> +	hw_err->timestamp = jiffies;
>> +	list_for_each_entry_safe(err, tmp, &qm->uacce_hw_errs, list) {
>> +		if ((hw_err->timestamp - err->timestamp) / HZ >
>> +		    SECONDS_PER_HOUR) {
>> +			list_del(&err->list);
>> +			kfree(err);
>> +		} else {
>> +			count++;
>> +		}
>> +	}
>> +	list_add(&hw_err->list, &qm->uacce_hw_errs);
>> +	mutex_unlock(&isolate->isolate_lock);
>> +
>> +	if (count >= isolate->hw_err_isolate_hz)
>> +		atomic_set(&isolate->is_isolate, 1);
>
> Why is this an atomic value?  You can change it after it has been set,
> and it can be modified after it is read.  A normal 'bool' would work
> exactly the same.
>
>
Yes, a normal 'bool' is good.
>
>> +
>> +	return 0;
>> +}
>> +
>> +static void qm_hw_err_destroy(struct hisi_qm *qm)
>> +{
>> +	struct qm_hw_err *err, *tmp;
>> +
>> +	mutex_lock(&qm->isolate_data.isolate_lock);
>> +	list_for_each_entry_safe(err, tmp, &qm->uacce_hw_errs, list) {
>> +		list_del(&err->list);
>> +		kfree(err);
>> +	}
>> +	mutex_unlock(&qm->isolate_data.isolate_lock);
>> +}
>> +
>> +static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce)
>> +{
>> +	struct hisi_qm *qm = uacce->priv;
>> +	struct hisi_qm *pf_qm;
>> +
>> +	if (uacce->is_vf)
>> +		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
>> +	else
>> +		pf_qm = qm;
>> +
>> +	return atomic_read(&pf_qm->isolate_data.is_isolate) ?
>> +			UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL;
>> +}
>> +
>> +static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce,
>> +					  const char *buf, size_t len)
>> +{
>> +	struct hisi_qm *qm = uacce->priv;
>> +	unsigned long val;
>> +
>> +#define MAX_ISOLATE_STRATEGY	65535
>
> What is this value?  Please document it better.
>

This is the maximum frequency that can be specified; I will move this value to uacce.
>> +
>> +	/* Must be set by PF */
>> +	if (uacce->is_vf) {
>> +		dev_info(&qm->pdev->dev, "the isolation strategy must be set by PF.\n");
>
> Do not let userspace spam the kernel log for no good reason.
>
> And why is this an info?

I will delete it.
>
>> +		return -EINVAL;
>
> This looks like an error.
>
>> +	}
>> +
>> +	if (atomic_read(&qm->isolate_data.is_isolate))
>> +		return -EINVAL;
>
> What happens if this changes right after reading it?

It is meaningless to configure a count for a device that has already been isolated.
>
>> +
>> +	if (kstrtoul(buf, 0, &val) < 0)
>> +		return -EINVAL;
>> +
>> +	if (val > MAX_ISOLATE_STRATEGY)
>> +		return -EINVAL;
>> +
>> +	qm->isolate_data.hw_err_isolate_hz = val;
>> +
>> +	/* After the policy is updated, need to reset the hardware err list */
>> +	qm_hw_err_destroy(qm);
>> +
>> +	return 0;
>
> Don't you need to return the number of bytes read, not 0?
>
>> +}
>> +
>> +static int hisi_qm_isolate_strategy_read(struct uacce_device *uacce, char *buf)
>> +{
>> +	struct hisi_qm *qm = uacce->priv;
>> +	struct hisi_qm *pf_qm;
>> +	unsigned long val;
>> +
>> +	if (uacce->is_vf) {
>> +		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
>> +		val = pf_qm->isolate_data.hw_err_isolate_hz;
>> +	} else {
>> +		val = qm->isolate_data.hw_err_isolate_hz;
>> +	}
>> +
>> +	return sysfs_emit(buf, "%lu\n", val);
>> +}
>> +
>>  static const struct uacce_ops uacce_qm_ops = {
>>  	.get_available_instances = hisi_qm_get_available_instances,
>>  	.get_queue = hisi_qm_uacce_get_queue,
>> @@ -3419,8 +3548,22 @@ static const struct uacce_ops uacce_qm_ops = {
>>  	.mmap = hisi_qm_uacce_mmap,
>>  	.ioctl = hisi_qm_uacce_ioctl,
>>  	.is_q_updated = hisi_qm_is_q_updated,
>> +	.get_isolate_state = hisi_qm_get_isolate_state,
>> +	.isolate_strategy_write = hisi_qm_isolate_strategy_write,
>> +	.isolate_strategy_read = hisi_qm_isolate_strategy_read,
>>  };
>>
>> +static void qm_remove_uacce(struct hisi_qm *qm)
>> +{
>> +	struct uacce_device *uacce = qm->uacce;
>> +
>> +	if (qm->use_sva) {
>> +		qm_hw_err_destroy(qm);
>> +		uacce_remove(uacce);
>> +		qm->uacce = NULL;
>> +	}
>> +}
>> +
>>  static int qm_alloc_uacce(struct hisi_qm *qm)
>>  {
>>  	struct pci_dev *pdev = qm->pdev;
>> @@ -3433,6 +3576,8 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
>>  	};
>>  	int ret;
>>
>> +	INIT_LIST_HEAD(&qm->uacce_hw_errs);
>> +	mutex_init(&qm->isolate_data.isolate_lock);
>>  	ret = strscpy(interface.name, dev_driver_string(&pdev->dev),
>>  		      sizeof(interface.name));
>>  	if (ret < 0)
>> @@ -3446,8 +3591,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
>>  		qm->use_sva = true;
>>  	} else {
>>  		/* only consider sva case */
>> -		uacce_remove(uacce);
>> -		qm->uacce = NULL;
>> +		qm_remove_uacce(qm);
>>  		return -EINVAL;
>>  	}
>>
>> @@ -5109,6 +5253,12 @@ static int qm_controller_reset_prepare(struct hisi_qm *qm)
>>  		return ret;
>>  	}
>>
>> +	if (qm->use_sva) {
>> +		ret = qm_hw_err_isolate(qm);
>> +		if (ret)
>> +			pci_err(pdev, "failed to isolate hw err!\n");
>> +	}
>> +
>>  	ret = qm_wait_vf_prepare_finish(qm);
>>  	if (ret)
>>  		pci_err(pdev, "failed to stop by vfs in soft reset!\n");
>> @@ -5436,19 +5586,25 @@ static int qm_controller_reset(struct hisi_qm *qm)
>>  	ret = qm_soft_reset(qm);
>>  	if (ret) {
>>  		pci_err(pdev, "Controller reset failed (%d)\n", ret);
>> -		qm_reset_bit_clear(qm);
>> -		return ret;
>> +		goto err_reset;
>>  	}
>>
>>  	ret = qm_controller_reset_done(qm);
>> -	if (ret) {
>> -		qm_reset_bit_clear(qm);
>> -		return ret;
>> -	}
>> +	if (ret)
>> +		goto err_reset;
>>
>>  	pci_info(pdev, "Controller reset complete\n");
>>
>>  	return 0;
>> +
>> +err_reset:
>> +	pci_err(pdev, "Controller reset failed (%d)\n", ret);
>> +	qm_reset_bit_clear(qm);
>> +
>> +	/* if resetting fails, isolate the device */
>> +	if (qm->use_sva && !qm->uacce->is_vf)
>> +		atomic_set(&qm->isolate_data.is_isolate, 1);
>> +	return ret;
>>  }
>>
>>  /**
>> @@ -6246,10 +6402,7 @@ int hisi_qm_init(struct hisi_qm *qm)
>>  err_free_qm_memory:
>>  	hisi_qm_memory_uninit(qm);
>>  err_alloc_uacce:
>> -	if (qm->use_sva) {
>> -		uacce_remove(qm->uacce);
>> -		qm->uacce = NULL;
>> -	}
>> +	qm_remove_uacce(qm);
>>  err_irq_register:
>>  	qm_irq_unregister(qm);
>>  err_pci_init:
>> diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
>> index 116e8bd68c99..44454150c205 100644
>> --- a/include/linux/hisi_acc_qm.h
>> +++ b/include/linux/hisi_acc_qm.h
>> @@ -271,6 +271,13 @@ struct hisi_qm_poll_data {
>>  	u16 *qp_finish_id;
>>  };
>>
>> +struct qm_err_isolate {
>> +	struct mutex isolate_lock;
>> +	/* user cfg freq which triggers isolation */
>> +	u32 hw_err_isolate_hz;
>> +	atomic_t is_isolate;
>
> Again, why is this an atomic value?

Yes, bool can be used.
>
> thanks,
>
> greg k-h
> .
>

thanks

Kai

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-06-27 12:10 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-23  6:14 [PATCH v4 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
2022-06-23  6:14 ` [PATCH v4 1/3] uacce: " Kai Ye
2022-06-23  6:14 ` [PATCH v4 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
2022-06-23  9:01   ` Greg KH
2022-06-24  3:25     ` yekai(A)
2022-06-24  6:44       ` Greg KH
2022-06-23  6:14 ` [PATCH v4 3/3] crypto: hisilicon/qm - defining the device isolation strategy Kai Ye
2022-06-23  9:05   ` Greg KH
2022-06-27 12:09     ` yekai(A)

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.