All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5 0/3] crypto: hisilicon - supports device isolation feature
@ 2022-07-08  7:08 Kai Ye
  2022-07-08  7:08 ` [PATCH v5 1/3] uacce: " Kai Ye
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Kai Ye @ 2022-07-08  7:08 UTC (permalink / raw)
  To: gregkh, herbert; +Cc: linux-crypto, linux-kernel, wangzhou1, yekai13

1. Add the uacce hardware error isolation interface, which supports
   configuring the hardware error isolation frequency.
2. Define the isolation strategy for ACC through the uacce sysfs node. If
   the number of hardware errors per hour exceeds the configured value,
   the device will not be available in user space. The VF device uses the
   PF device's isolation strategy.
   
changes v1->v2:
	- deleted dev_to_uacce api.
	- add vfs node doc. 
	- move uacce->ref to driver.
changes v2->v3:
	- deleted some redundant code.
	- use qm state instead of reference count.
	- add null pointer check.
	- isolate_strategy_read() instead of a copy.
changes v3->v4:
	- modify a comment
changes v4->v5:
	- use bool instead of atomic.
	- isolation frequency instead of isolation command.

 Documentation/ABI/testing/sysfs-driver-uacce |  18 ++
 drivers/crypto/hisilicon/qm.c                | 163 +++++++++++++++++--
 drivers/misc/uacce/uacce.c                   |  55 +++++++
 include/linux/hisi_acc_qm.h                  |   9 +
 include/linux/uacce.h                        |  11 ++
 5 files changed, 244 insertions(+), 12 deletions(-)

-- 
2.33.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v5 1/3] uacce: supports device isolation feature
  2022-07-08  7:08 [PATCH v5 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
@ 2022-07-08  7:08 ` Kai Ye
  2022-07-08  7:28   ` Greg KH
  2022-07-08  7:08 ` [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
  2022-07-08  7:08 ` [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy Kai Ye
  2 siblings, 1 reply; 14+ messages in thread
From: Kai Ye @ 2022-07-08  7:08 UTC (permalink / raw)
  To: gregkh, herbert; +Cc: linux-crypto, linux-kernel, wangzhou1, yekai13

Add a hardware error isolation API to UACCE. Users can configure
the isolation frequency through a sysfs file, and UACCE reports the
device isolation state to user space. If the AER error frequency exceeds
the configured value within a certain period of time, the device will be
isolated.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 drivers/misc/uacce/uacce.c | 55 ++++++++++++++++++++++++++++++++++++++
 include/linux/uacce.h      | 11 ++++++++
 2 files changed, 66 insertions(+)

diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
index 281c54003edc..d07b5f1f0596 100644
--- a/drivers/misc/uacce/uacce.c
+++ b/drivers/misc/uacce/uacce.c
@@ -7,6 +7,8 @@
 #include <linux/slab.h>
 #include <linux/uacce.h>
 
+#define MAX_ERR_ISOLATE_COUNT		65535
+
 static struct class *uacce_class;
 static dev_t uacce_devt;
 static DEFINE_MUTEX(uacce_mutex);
@@ -339,12 +341,63 @@ static ssize_t region_dus_size_show(struct device *dev,
 		       uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT);
 }
 
+static ssize_t isolate_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	if (!uacce->ops->get_isolate_state)
+		return -ENODEV;
+
+	return sysfs_emit(buf, "%d\n", uacce->ops->get_isolate_state(uacce));
+}
+
+static ssize_t isolate_strategy_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+	u32 val;
+
+	if (!uacce->ops->isolate_strategy_read)
+		return -ENODEV;
+
+	val = uacce->ops->isolate_strategy_read(uacce);
+	if (val > MAX_ERR_ISOLATE_COUNT)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%u\n", val);
+}
+
+static ssize_t isolate_strategy_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+	unsigned long val;
+	int ret;
+
+	if (!uacce->ops->isolate_strategy_write)
+		return -ENODEV;
+
+	if (kstrtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val > MAX_ERR_ISOLATE_COUNT)
+		return -EINVAL;
+
+	ret = uacce->ops->isolate_strategy_write(uacce, val);
+
+	return ret ? ret : count;
+}
+
 static DEVICE_ATTR_RO(api);
 static DEVICE_ATTR_RO(flags);
 static DEVICE_ATTR_RO(available_instances);
 static DEVICE_ATTR_RO(algorithms);
 static DEVICE_ATTR_RO(region_mmio_size);
 static DEVICE_ATTR_RO(region_dus_size);
+static DEVICE_ATTR_RO(isolate);
+static DEVICE_ATTR_RW(isolate_strategy);
 
 static struct attribute *uacce_dev_attrs[] = {
 	&dev_attr_api.attr,
@@ -353,6 +406,8 @@ static struct attribute *uacce_dev_attrs[] = {
 	&dev_attr_algorithms.attr,
 	&dev_attr_region_mmio_size.attr,
 	&dev_attr_region_dus_size.attr,
+	&dev_attr_isolate.attr,
+	&dev_attr_isolate_strategy.attr,
 	NULL,
 };
 
diff --git a/include/linux/uacce.h b/include/linux/uacce.h
index 48e319f40275..69e8f238d80c 100644
--- a/include/linux/uacce.h
+++ b/include/linux/uacce.h
@@ -30,6 +30,9 @@ struct uacce_qfile_region {
  * @is_q_updated: check whether the task is finished
  * @mmap: mmap addresses of queue to user space
  * @ioctl: ioctl for user space users of the queue
+ * @get_isolate_state: get the device state after setting the isolate strategy
+ * @isolate_strategy_write: store the isolate strategy to the device
+ * @isolate_strategy_read: read the isolate strategy value from the device
  */
 struct uacce_ops {
 	int (*get_available_instances)(struct uacce_device *uacce);
@@ -43,6 +46,9 @@ struct uacce_ops {
 		    struct uacce_qfile_region *qfr);
 	long (*ioctl)(struct uacce_queue *q, unsigned int cmd,
 		      unsigned long arg);
+	enum uacce_dev_state (*get_isolate_state)(struct uacce_device *uacce);
+	int (*isolate_strategy_write)(struct uacce_device *uacce, u32 freq);
+	u32 (*isolate_strategy_read)(struct uacce_device *uacce);
 };
 
 /**
@@ -57,6 +63,11 @@ struct uacce_interface {
 	const struct uacce_ops *ops;
 };
 
+enum uacce_dev_state {
+	UACCE_DEV_NORMAL,
+	UACCE_DEV_ISOLATE,
+};
+
 enum uacce_q_state {
 	UACCE_Q_ZOMBIE = 0,
 	UACCE_Q_INIT,
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-07-08  7:08 [PATCH v5 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
  2022-07-08  7:08 ` [PATCH v5 1/3] uacce: " Kai Ye
@ 2022-07-08  7:08 ` Kai Ye
  2022-07-08  7:30   ` Greg KH
  2022-07-08  7:30   ` Greg KH
  2022-07-08  7:08 ` [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy Kai Ye
  2 siblings, 2 replies; 14+ messages in thread
From: Kai Ye @ 2022-07-08  7:08 UTC (permalink / raw)
  To: gregkh, herbert; +Cc: linux-crypto, linux-kernel, wangzhou1, yekai13

Update the documentation to describe the sysfs file that lets users
configure the isolation strategy from user space, and the sysfs file
that reports the device isolation state.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
index 08f2591138af..a8056271a963 100644
--- a/Documentation/ABI/testing/sysfs-driver-uacce
+++ b/Documentation/ABI/testing/sysfs-driver-uacce
@@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
 Description:    Available instances left of the device
                 Return -ENODEV if uacce_ops get_available_instances is not provided
 
+What:           /sys/class/uacce/<dev_name>/isolate_strategy
+Date:           Jul 2022
+KernelVersion:  5.20
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    A sysfs file used to configure the hardware error
+                isolation strategy. The strategy is an integer value.
+                The default is 0. The maximum value is 65535. This value
+                indicates the number of device slot resets per unit time
+                that your service can tolerate before the device is isolated.
+
+What:           /sys/class/uacce/<dev_name>/isolate
+Date:           Jul 2022
+KernelVersion:  5.20
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    A sysfs file that reports the device isolation state. The
+                value 0 means that the device is working. The value 1 means
+                that the device has been isolated.
+
 What:           /sys/class/uacce/<dev_name>/algorithms
 Date:           Feb 2020
 KernelVersion:  5.7
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy
  2022-07-08  7:08 [PATCH v5 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
  2022-07-08  7:08 ` [PATCH v5 1/3] uacce: " Kai Ye
  2022-07-08  7:08 ` [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
@ 2022-07-08  7:08 ` Kai Ye
  2022-07-08  7:35   ` Greg KH
  2 siblings, 1 reply; 14+ messages in thread
From: Kai Ye @ 2022-07-08  7:08 UTC (permalink / raw)
  To: gregkh, herbert; +Cc: linux-crypto, linux-kernel, wangzhou1, yekai13

Define the device isolation strategy in the device driver. The
user configures a frequency value through the uacce interface. If the
slot reset frequency exceeds the configured value within the time
window, the device will not be available in user space.
The time window is one hour. The VF device uses the PF device's
isolation strategy. All hardware errors are processed by the PF
driver. This solution can be used for other drivers.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 163 +++++++++++++++++++++++++++++++---
 include/linux/hisi_acc_qm.h   |   9 ++
 2 files changed, 160 insertions(+), 12 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index ad83c194d664..8eb3b790a655 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -417,6 +417,16 @@ struct hisi_qm_resource {
 	struct list_head list;
 };
 
+/**
+ * struct qm_hw_err - Structure describing the device errors
+ * @list: hardware error list
+ * @timestamp: timestamp when the error occurred
+ */
+struct qm_hw_err {
+	struct list_head list;
+	unsigned long long timestamp;
+};
+
 struct hisi_qm_hw_ops {
 	int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number);
 	void (*qm_db)(struct hisi_qm *qm, u16 qn,
@@ -3410,6 +3420,111 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd,
 	return 0;
 }
 
+/**
+ * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs
+ * according to the user's configured isolation strategy. Warning: this
+ * API should be called while the users of this device are suspended
+ * by the slot reset preparation of PCI AER.
+ * @qm: the uacce device
+ */
+static int qm_hw_err_isolate(struct hisi_qm *qm)
+{
+	struct qm_hw_err *err, *tmp, *hw_err;
+	struct qm_err_isolate *isolate;
+	u32 count = 0;
+
+	isolate = &qm->isolate_data;
+
+#define SECONDS_PER_HOUR	3600
+
+	/* All the hw errs are processed by PF driver */
+	if (qm->uacce->is_vf || isolate->is_isolate ||
+	    !isolate->hw_err_isolate_hz)
+		return 0;
+
+	hw_err = kzalloc(sizeof(*hw_err), GFP_ATOMIC);
+	if (!hw_err)
+		return -ENOMEM;
+
+	mutex_lock(&isolate->isolate_lock);
+	hw_err->timestamp = jiffies;
+	list_for_each_entry_safe(err, tmp, &isolate->uacce_hw_errs, list) {
+		if ((hw_err->timestamp - err->timestamp) / HZ >
+		    SECONDS_PER_HOUR) {
+			list_del(&err->list);
+			kfree(err);
+		} else {
+			count++;
+		}
+	}
+	list_add(&hw_err->list, &isolate->uacce_hw_errs);
+	mutex_unlock(&isolate->isolate_lock);
+
+	if (count >= isolate->hw_err_isolate_hz)
+		isolate->is_isolate = true;
+
+	return 0;
+}
+
+static void qm_hw_err_destroy(struct hisi_qm *qm)
+{
+	struct qm_hw_err *err, *tmp;
+
+	mutex_lock(&qm->isolate_data.isolate_lock);
+	list_for_each_entry_safe(err, tmp, &qm->isolate_data.uacce_hw_errs, list) {
+		list_del(&err->list);
+		kfree(err);
+	}
+	mutex_unlock(&qm->isolate_data.isolate_lock);
+}
+
+static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce)
+{
+	struct hisi_qm *qm = uacce->priv;
+	struct hisi_qm *pf_qm;
+
+	if (uacce->is_vf)
+		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
+	else
+		pf_qm = qm;
+
+	return pf_qm->isolate_data.is_isolate ?
+			UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL;
+}
+
+static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce,
+					  u32 freq)
+{
+	struct hisi_qm *qm = uacce->priv;
+
+	/* Must be set by PF */
+	if (uacce->is_vf)
+		return -EINVAL;
+
+	if (qm->isolate_data.is_isolate)
+		return -EINVAL;
+
+	qm->isolate_data.hw_err_isolate_hz = freq;
+
+	/* After the policy is updated, need to reset the hardware err list */
+	qm_hw_err_destroy(qm);
+
+	return 0;
+}
+
+static u32 hisi_qm_isolate_strategy_read(struct uacce_device *uacce)
+{
+	struct hisi_qm *qm = uacce->priv;
+	struct hisi_qm *pf_qm;
+
+	if (uacce->is_vf) {
+		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
+		return pf_qm->isolate_data.hw_err_isolate_hz;
+	} else {
+		return qm->isolate_data.hw_err_isolate_hz;
+	}
+}
+
 static const struct uacce_ops uacce_qm_ops = {
 	.get_available_instances = hisi_qm_get_available_instances,
 	.get_queue = hisi_qm_uacce_get_queue,
@@ -3419,8 +3534,22 @@ static const struct uacce_ops uacce_qm_ops = {
 	.mmap = hisi_qm_uacce_mmap,
 	.ioctl = hisi_qm_uacce_ioctl,
 	.is_q_updated = hisi_qm_is_q_updated,
+	.get_isolate_state = hisi_qm_get_isolate_state,
+	.isolate_strategy_write = hisi_qm_isolate_strategy_write,
+	.isolate_strategy_read = hisi_qm_isolate_strategy_read,
 };
 
+static void qm_remove_uacce(struct hisi_qm *qm)
+{
+	struct uacce_device *uacce = qm->uacce;
+
+	if (qm->use_sva) {
+		qm_hw_err_destroy(qm);
+		uacce_remove(uacce);
+		qm->uacce = NULL;
+	}
+}
+
 static int qm_alloc_uacce(struct hisi_qm *qm)
 {
 	struct pci_dev *pdev = qm->pdev;
@@ -3446,8 +3575,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
 		qm->use_sva = true;
 	} else {
 		/* only consider sva case */
-		uacce_remove(uacce);
-		qm->uacce = NULL;
+		qm_remove_uacce(qm);
 		return -EINVAL;
 	}
 
@@ -3479,6 +3607,8 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
 	uacce->qf_pg_num[UACCE_QFRT_DUS]  = dus_page_nr;
 
 	qm->uacce = uacce;
+	INIT_LIST_HEAD(&qm->isolate_data.uacce_hw_errs);
+	mutex_init(&qm->isolate_data.isolate_lock);
 
 	return 0;
 }
@@ -5109,6 +5239,12 @@ static int qm_controller_reset_prepare(struct hisi_qm *qm)
 		return ret;
 	}
 
+	if (qm->use_sva) {
+		ret = qm_hw_err_isolate(qm);
+		if (ret)
+			pci_err(pdev, "failed to isolate hw err!\n");
+	}
+
 	ret = qm_wait_vf_prepare_finish(qm);
 	if (ret)
 		pci_err(pdev, "failed to stop by vfs in soft reset!\n");
@@ -5436,19 +5572,25 @@ static int qm_controller_reset(struct hisi_qm *qm)
 	ret = qm_soft_reset(qm);
 	if (ret) {
 		pci_err(pdev, "Controller reset failed (%d)\n", ret);
-		qm_reset_bit_clear(qm);
-		return ret;
+		goto err_reset;
 	}
 
 	ret = qm_controller_reset_done(qm);
-	if (ret) {
-		qm_reset_bit_clear(qm);
-		return ret;
-	}
+	if (ret)
+		goto err_reset;
 
 	pci_info(pdev, "Controller reset complete\n");
 
 	return 0;
+
+err_reset:
+	pci_err(pdev, "Controller reset failed (%d)\n", ret);
+	qm_reset_bit_clear(qm);
+
+	/* if resetting fails, isolate the device */
+	if (qm->use_sva && !qm->uacce->is_vf)
+		qm->isolate_data.is_isolate = true;
+	return ret;
 }
 
 /**
@@ -6246,10 +6388,7 @@ int hisi_qm_init(struct hisi_qm *qm)
 err_free_qm_memory:
 	hisi_qm_memory_uninit(qm);
 err_alloc_uacce:
-	if (qm->use_sva) {
-		uacce_remove(qm->uacce);
-		qm->uacce = NULL;
-	}
+	qm_remove_uacce(qm);
 err_irq_register:
 	qm_irq_unregister(qm);
 err_pci_init:
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 116e8bd68c99..e7aa6a451ec9 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -271,6 +271,14 @@ struct hisi_qm_poll_data {
 	u16 *qp_finish_id;
 };
 
+struct qm_err_isolate {
+	struct mutex isolate_lock;
+	/* user cfg freq which triggers isolation */
+	u32 hw_err_isolate_hz;
+	bool is_isolate;
+	struct list_head uacce_hw_errs;
+};
+
 struct hisi_qm {
 	enum qm_hw_ver ver;
 	enum qm_fun_type fun_type;
@@ -335,6 +343,7 @@ struct hisi_qm {
 	struct qm_shaper_factor *factor;
 	u32 mb_qos;
 	u32 type_rate;
+	struct qm_err_isolate isolate_data;
 };
 
 struct hisi_qp_status {
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 1/3] uacce: supports device isolation feature
  2022-07-08  7:08 ` [PATCH v5 1/3] uacce: " Kai Ye
@ 2022-07-08  7:28   ` Greg KH
  2022-07-08  9:33     ` yekai(A)
  0 siblings, 1 reply; 14+ messages in thread
From: Greg KH @ 2022-07-08  7:28 UTC (permalink / raw)
  To: Kai Ye; +Cc: herbert, linux-crypto, linux-kernel, wangzhou1

On Fri, Jul 08, 2022 at 03:08:18PM +0800, Kai Ye wrote:
> UACCE adds the hardware error isolation API. Users can configure
> the isolation frequency by this sysfs node. UACCE reports the device
> isolate state to the user space. If the AER error frequency exceeds
> the value of setting for a certain period of time, the device will be
> isolated.
> 
> Signed-off-by: Kai Ye <yekai13@huawei.com>
> ---
>  drivers/misc/uacce/uacce.c | 55 ++++++++++++++++++++++++++++++++++++++
>  include/linux/uacce.h      | 11 ++++++++
>  2 files changed, 66 insertions(+)
> 
> diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
> index 281c54003edc..d07b5f1f0596 100644
> --- a/drivers/misc/uacce/uacce.c
> +++ b/drivers/misc/uacce/uacce.c
> @@ -7,6 +7,8 @@
>  #include <linux/slab.h>
>  #include <linux/uacce.h>
>  
> +#define MAX_ERR_ISOLATE_COUNT		65535
> +
>  static struct class *uacce_class;
>  static dev_t uacce_devt;
>  static DEFINE_MUTEX(uacce_mutex);
> @@ -339,12 +341,63 @@ static ssize_t region_dus_size_show(struct device *dev,
>  		       uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT);
>  }
>  
> +static ssize_t isolate_show(struct device *dev,
> +			    struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	if (!uacce->ops->get_isolate_state)
> +		return -ENODEV;

If there is no callback, why is this sysfs file even created at all?  Please
do not create it if it can not be accessed.

Use the is_visible() callback for the group to do this.

> +
> +	return sysfs_emit(buf, "%d\n", uacce->ops->get_isolate_state(uacce));
> +}
> +
> +static ssize_t isolate_strategy_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +	u32 val;
> +
> +	if (!uacce->ops->isolate_strategy_read)
> +		return -ENODEV;

Same here, don't have a sysfs file that does nothing.

> +
> +	val = uacce->ops->isolate_strategy_read(uacce);
> +	if (val > MAX_ERR_ISOLATE_COUNT)
> +		return -EINVAL;
> +
> +	return sysfs_emit(buf, "%u\n", val);
> +}
> +
> +static ssize_t isolate_strategy_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t count)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +	unsigned long val;
> +	int ret;
> +
> +	if (!uacce->ops->isolate_strategy_write)
> +		return -ENODEV;

Same here.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-07-08  7:08 ` [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
@ 2022-07-08  7:30   ` Greg KH
  2022-07-08  9:38     ` yekai(A)
  2022-07-08  7:30   ` Greg KH
  1 sibling, 1 reply; 14+ messages in thread
From: Greg KH @ 2022-07-08  7:30 UTC (permalink / raw)
  To: Kai Ye; +Cc: herbert, linux-crypto, linux-kernel, wangzhou1

On Fri, Jul 08, 2022 at 03:08:19PM +0800, Kai Ye wrote:
> Update documentation describing sysfs node that could help to
> configure isolation strategy for users in the user space. And
> describing sysfs node that could read the device isolated state.
> 
> Signed-off-by: Kai Ye <yekai13@huawei.com>
> ---
>  Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
> index 08f2591138af..a8056271a963 100644
> --- a/Documentation/ABI/testing/sysfs-driver-uacce
> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
> @@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
>  Description:    Available instances left of the device
>                  Return -ENODEV if uacce_ops get_available_instances is not provided
>  
> +What:           /sys/class/uacce/<dev_name>/isolate_strategy
> +Date:           Jul 2022
> +KernelVersion:  5.20
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    A sysfs node that used to configures the hardware error

This is not a "node" it is just a file.


> +                isolation strategy. This strategy is a configured integer value.
> +                The default is 0. The maximum value is 65535. This value
> +                indicates the number of device slot resets per unit time
> +                that your service can tolerate.

I do not understand this, sorry.  What do you mean by "that your service
can tolerate"?

> +
> +What:           /sys/class/uacce/<dev_name>/isolate
> +Date:           Jul 2022
> +KernelVersion:  5.20
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    A sysfs node that read the device isolated state. The value 0
> +                means that the device is working. The value 1 means that the
> +                device has been isolated.

So 1 means "not working"?  This seems odd, perhaps you can rephrase this
a bit better?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-07-08  7:08 ` [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
  2022-07-08  7:30   ` Greg KH
@ 2022-07-08  7:30   ` Greg KH
  1 sibling, 0 replies; 14+ messages in thread
From: Greg KH @ 2022-07-08  7:30 UTC (permalink / raw)
  To: Kai Ye; +Cc: herbert, linux-crypto, linux-kernel, wangzhou1

On Fri, Jul 08, 2022 at 03:08:19PM +0800, Kai Ye wrote:
> Update documentation describing sysfs node that could help to
> configure isolation strategy for users in the user space. And
> describing sysfs node that could read the device isolated state.
> 
> Signed-off-by: Kai Ye <yekai13@huawei.com>
> ---
>  Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
> index 08f2591138af..a8056271a963 100644
> --- a/Documentation/ABI/testing/sysfs-driver-uacce
> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
> @@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
>  Description:    Available instances left of the device
>                  Return -ENODEV if uacce_ops get_available_instances is not provided
>  
> +What:           /sys/class/uacce/<dev_name>/isolate_strategy
> +Date:           Jul 2022
> +KernelVersion:  5.20
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    A sysfs node that used to configures the hardware error
> +                isolation strategy. This strategy is a configured integer value.
> +                The default is 0. The maximum value is 65535. This value
> +                indicates the number of device slot resets per unit time
> +                that your service can tolerate.
> +
> +What:           /sys/class/uacce/<dev_name>/isolate
> +Date:           Jul 2022
> +KernelVersion:  5.20
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    A sysfs node that read the device isolated state. The value 0
> +                means that the device is working. The value 1 means that the
> +                device has been isolated.
> +

You only describe 2 files here, yet your patch had 3 sysfs files.
Please always document everything.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy
  2022-07-08  7:08 ` [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy Kai Ye
@ 2022-07-08  7:35   ` Greg KH
  2022-07-21  8:14     ` yekai(A)
  0 siblings, 1 reply; 14+ messages in thread
From: Greg KH @ 2022-07-08  7:35 UTC (permalink / raw)
  To: Kai Ye; +Cc: herbert, linux-crypto, linux-kernel, wangzhou1

On Fri, Jul 08, 2022 at 03:08:20PM +0800, Kai Ye wrote:
> Define the device isolation strategy by the device driver. The
> user configures a frequency value by uacce interface. If the
> slot reset frequency exceeds the value of setting for a certain
> period of time, the device will not be available in user space.
> The time window is one hour. The VF device use the PF device
> isolation strategy. All the hardware errors are processed by PF
> driver. This solution can be used for other drivers.
> 
> Signed-off-by: Kai Ye <yekai13@huawei.com>
> ---
>  drivers/crypto/hisilicon/qm.c | 163 +++++++++++++++++++++++++++++++---
>  include/linux/hisi_acc_qm.h   |   9 ++
>  2 files changed, 160 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
> index ad83c194d664..8eb3b790a655 100644
> --- a/drivers/crypto/hisilicon/qm.c
> +++ b/drivers/crypto/hisilicon/qm.c
> @@ -417,6 +417,16 @@ struct hisi_qm_resource {
>  	struct list_head list;
>  };
>  
> +/**
> + * struct qm_hw_err - Structure describing the device errors
> + * @list: hardware error list
> + * @timestamp: timestamp when the error occurred
> + */
> +struct qm_hw_err {
> +	struct list_head list;
> +	unsigned long long timestamp;
> +};
> +
>  struct hisi_qm_hw_ops {
>  	int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number);
>  	void (*qm_db)(struct hisi_qm *qm, u16 qn,
> @@ -3410,6 +3420,111 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd,
>  	return 0;
>  }
>  
> +/**
> + * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs
> + * according to user's configuration of isolation strategy. Warning: this
> + * API should be called while there the users on this device are suspended
> + * by slot resetting preparation of PCI AER.
> + * @qm: the uacce device
> + */
> +static int qm_hw_err_isolate(struct hisi_qm *qm)
> +{
> +	struct qm_hw_err *err, *tmp, *hw_err;
> +	struct qm_err_isolate *isolate;
> +	u32 count = 0;
> +
> +	isolate = &qm->isolate_data;
> +
> +#define SECONDS_PER_HOUR	3600
> +
> +	/* All the hw errs are processed by PF driver */
> +	if (qm->uacce->is_vf || isolate->is_isolate ||
> +	    !isolate->hw_err_isolate_hz)
> +		return 0;
> +
> +	hw_err = kzalloc(sizeof(*hw_err), GFP_ATOMIC);

Why atomic?  What lock is held here?

> +	if (!hw_err)
> +		return -ENOMEM;
> +
> +	mutex_lock(&isolate->isolate_lock);
> +	hw_err->timestamp = jiffies;
> +	list_for_each_entry_safe(err, tmp, &isolate->uacce_hw_errs, list) {
> +		if ((hw_err->timestamp - err->timestamp) / HZ >
> +		    SECONDS_PER_HOUR) {

No possibility of wrapping the timestamp?

> +			list_del(&err->list);
> +			kfree(err);
> +		} else {
> +			count++;
> +		}
> +	}
> +	list_add(&hw_err->list, &isolate->uacce_hw_errs);
> +	mutex_unlock(&isolate->isolate_lock);
> +
> +	if (count >= isolate->hw_err_isolate_hz)
> +		isolate->is_isolate = true;
> +
> +	return 0;
> +}
> +
> +static void qm_hw_err_destroy(struct hisi_qm *qm)
> +{
> +	struct qm_hw_err *err, *tmp;
> +
> +	mutex_lock(&qm->isolate_data.isolate_lock);
> +	list_for_each_entry_safe(err, tmp, &qm->isolate_data.uacce_hw_errs, list) {
> +		list_del(&err->list);
> +		kfree(err);
> +	}
> +	mutex_unlock(&qm->isolate_data.isolate_lock);
> +}
> +
> +static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce)
> +{
> +	struct hisi_qm *qm = uacce->priv;
> +	struct hisi_qm *pf_qm;
> +
> +	if (uacce->is_vf)
> +		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
> +	else
> +		pf_qm = qm;
> +
> +	return pf_qm->isolate_data.is_isolate ?
> +			UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL;
> +}
> +
> +static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce,
> +					  u32 freq)
> +{
> +	struct hisi_qm *qm = uacce->priv;
> +
> +	/* Must be set by PF */
> +	if (uacce->is_vf)
> +		return -EINVAL;

But the value passed to you is not invalid, something else went wrong.
Are you sure this is the correct error?

> +
> +	if (qm->isolate_data.is_isolate)
> +		return -EINVAL;

Same here, why is this correct?

> +
> +	qm->isolate_data.hw_err_isolate_hz = freq;

No validation of the value passed to you?  It can be anything?

> +
> +	/* After the policy is updated, need to reset the hardware err list */
> +	qm_hw_err_destroy(qm);

No error checking?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 1/3] uacce: supports device isolation feature
  2022-07-08  7:28   ` Greg KH
@ 2022-07-08  9:33     ` yekai(A)
  2022-07-08 10:01       ` Greg KH
  0 siblings, 1 reply; 14+ messages in thread
From: yekai(A) @ 2022-07-08  9:33 UTC (permalink / raw)
  To: Greg KH; +Cc: herbert, linux-crypto, linux-kernel, wangzhou1



On 2022/7/8 15:28, Greg KH wrote:
> On Fri, Jul 08, 2022 at 03:08:18PM +0800, Kai Ye wrote:
>> UACCE adds the hardware error isolation API. Users can configure
>> the isolation frequency by this sysfs node. UACCE reports the device
>> isolate state to the user space. If the AER error frequency exceeds
>> the value of setting for a certain period of time, the device will be
>> isolated.
>>
>> Signed-off-by: Kai Ye <yekai13@huawei.com>
>> ---
>>  drivers/misc/uacce/uacce.c | 55 ++++++++++++++++++++++++++++++++++++++
>>  include/linux/uacce.h      | 11 ++++++++
>>  2 files changed, 66 insertions(+)
>>
>> diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
>> index 281c54003edc..d07b5f1f0596 100644
>> --- a/drivers/misc/uacce/uacce.c
>> +++ b/drivers/misc/uacce/uacce.c
>> @@ -7,6 +7,8 @@
>>  #include <linux/slab.h>
>>  #include <linux/uacce.h>
>>
>> +#define MAX_ERR_ISOLATE_COUNT		65535
>> +
>>  static struct class *uacce_class;
>>  static dev_t uacce_devt;
>>  static DEFINE_MUTEX(uacce_mutex);
>> @@ -339,12 +341,63 @@ static ssize_t region_dus_size_show(struct device *dev,
>>  		       uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT);
>>  }
>>
>> +static ssize_t isolate_show(struct device *dev,
>> +			    struct device_attribute *attr, char *buf)
>> +{
>> +	struct uacce_device *uacce = to_uacce_device(dev);
>> +
>> +	if (!uacce->ops->get_isolate_state)
>> +		return -ENODEV;
>
> If there is no callback, why is this sysfs even created at all?  Please
> do not create it if it can not be accessed.
>
> Use the is_visable() callback for the group to do this.
>

In my testing, if is_visible() is used for this check, none of the uacce
device nodes can be registered when the callback is absent.

>> +
>> +	return sysfs_emit(buf, "%d\n", uacce->ops->get_isolate_state(uacce));
>> +}
>> +
>> +static ssize_t isolate_strategy_show(struct device *dev,
>> +				     struct device_attribute *attr, char *buf)
>> +{
>> +	struct uacce_device *uacce = to_uacce_device(dev);
>> +	u32 val;
>> +
>> +	if (!uacce->ops->isolate_strategy_read)
>> +		return -ENODEV;
>
> Same here, don't have a sysfs file that does nothing.
>
>> +
>> +	val = uacce->ops->isolate_strategy_read(uacce);
>> +	if (val > MAX_ERR_ISOLATE_COUNT)
>> +		return -EINVAL;
>> +
>> +	return sysfs_emit(buf, "%u\n", val);
>> +}
>> +
>> +static ssize_t isolate_strategy_store(struct device *dev,
>> +				      struct device_attribute *attr,
>> +				      const char *buf, size_t count)
>> +{
>> +	struct uacce_device *uacce = to_uacce_device(dev);
>> +	unsigned long val;
>> +	int ret;
>> +
>> +	if (!uacce->ops->isolate_strategy_write)
>> +		return -ENODEV;
>
> Same here.
>
> thanks,
>
> greg k-h
> .
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-07-08  7:30   ` Greg KH
@ 2022-07-08  9:38     ` yekai(A)
  2022-07-08 10:02       ` Greg KH
  0 siblings, 1 reply; 14+ messages in thread
From: yekai(A) @ 2022-07-08  9:38 UTC (permalink / raw)
  To: Greg KH; +Cc: herbert, linux-crypto, linux-kernel, wangzhou1



On 2022/7/8 15:30, Greg KH wrote:
> On Fri, Jul 08, 2022 at 03:08:19PM +0800, Kai Ye wrote:
>> Update documentation describing sysfs node that could help to
>> configure isolation strategy for users in the user space. And
>> describing sysfs node that could read the device isolated state.
>>
>> Signed-off-by: Kai Ye <yekai13@huawei.com>
>> ---
>>  Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
>> index 08f2591138af..a8056271a963 100644
>> --- a/Documentation/ABI/testing/sysfs-driver-uacce
>> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
>> @@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
>>  Description:    Available instances left of the device
>>                  Return -ENODEV if uacce_ops get_available_instances is not provided
>>
>> +What:           /sys/class/uacce/<dev_name>/isolate_strategy
>> +Date:           Jul 2022
>> +KernelVersion:  5.20
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    A sysfs node that used to configures the hardware error
>
> This is not a "node" it is just a file.
>
>
>> +                isolation strategy. This strategy is a configured integer value.
>> +                The default is 0. The maximum value is 65535. This value
>> +                indicates the number of device slot resets per unit time
>> +                that your service can tolerate.
>
> I do not understand this, sorry.  What do you mean by "that your service
> can tolerate"?

It means the reset frequency that the user can tolerate, because each
reset interrupts services.

>
>> +
>> +What:           /sys/class/uacce/<dev_name>/isolate
>> +Date:           Jul 2022
>> +KernelVersion:  5.20
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    A sysfs node that read the device isolated state. The value 0
>> +                means that the device is working. The value 1 means that the
>> +                device has been isolated.
>
> So 1 means "not working"?  This seems odd, perhaps you can rephrase this
> a bit better?

1 means the device is unavailable. 0 means the device is available.
>
> thanks,
>
> greg k-h
> .
>

Thanks

Kai

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 1/3] uacce: supports device isolation feature
  2022-07-08  9:33     ` yekai(A)
@ 2022-07-08 10:01       ` Greg KH
  0 siblings, 0 replies; 14+ messages in thread
From: Greg KH @ 2022-07-08 10:01 UTC (permalink / raw)
  To: yekai(A); +Cc: herbert, linux-crypto, linux-kernel, wangzhou1

On Fri, Jul 08, 2022 at 05:33:42PM +0800, yekai(A) wrote:
> 
> 
> On 2022/7/8 15:28, Greg KH wrote:
> > On Fri, Jul 08, 2022 at 03:08:18PM +0800, Kai Ye wrote:
> > > UACCE adds the hardware error isolation API. Users can configure
> > > the isolation frequency by this sysfs node. UACCE reports the device
> > > isolate state to the user space. If the AER error frequency exceeds
> > > the value of setting for a certain period of time, the device will be
> > > isolated.
> > > 
> > > Signed-off-by: Kai Ye <yekai13@huawei.com>
> > > ---
> > >  drivers/misc/uacce/uacce.c | 55 ++++++++++++++++++++++++++++++++++++++
> > >  include/linux/uacce.h      | 11 ++++++++
> > >  2 files changed, 66 insertions(+)
> > > 
> > > diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
> > > index 281c54003edc..d07b5f1f0596 100644
> > > --- a/drivers/misc/uacce/uacce.c
> > > +++ b/drivers/misc/uacce/uacce.c
> > > @@ -7,6 +7,8 @@
> > >  #include <linux/slab.h>
> > >  #include <linux/uacce.h>
> > > 
> > > +#define MAX_ERR_ISOLATE_COUNT		65535
> > > +
> > >  static struct class *uacce_class;
> > >  static dev_t uacce_devt;
> > >  static DEFINE_MUTEX(uacce_mutex);
> > > @@ -339,12 +341,63 @@ static ssize_t region_dus_size_show(struct device *dev,
> > >  		       uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT);
> > >  }
> > > 
> > > +static ssize_t isolate_show(struct device *dev,
> > > +			    struct device_attribute *attr, char *buf)
> > > +{
> > > +	struct uacce_device *uacce = to_uacce_device(dev);
> > > +
> > > +	if (!uacce->ops->get_isolate_state)
> > > +		return -ENODEV;
> > 
> > If there is no callback, why is this sysfs even created at all?  Please
> > do not create it if it can not be accessed.
> > 
> > Use the is_visable() callback for the group to do this.
> > 
> 
> If is_visable() is used as the judgment, all uacce device nodes cannot be
> registered if there is no callback by test.

I am sorry, I do not understand.  That callback is to be used for this
type of thing, and works for lots of driver subsystems.  Why exactly
will it not work here as well?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-07-08  9:38     ` yekai(A)
@ 2022-07-08 10:02       ` Greg KH
  0 siblings, 0 replies; 14+ messages in thread
From: Greg KH @ 2022-07-08 10:02 UTC (permalink / raw)
  To: yekai(A); +Cc: herbert, linux-crypto, linux-kernel, wangzhou1

On Fri, Jul 08, 2022 at 05:38:17PM +0800, yekai(A) wrote:
> 
> 
> On 2022/7/8 15:30, Greg KH wrote:
> > On Fri, Jul 08, 2022 at 03:08:19PM +0800, Kai Ye wrote:
> > > Update documentation describing sysfs node that could help to
> > > configure isolation strategy for users in the user space. And
> > > describing sysfs node that could read the device isolated state.
> > > 
> > > Signed-off-by: Kai Ye <yekai13@huawei.com>
> > > ---
> > >  Documentation/ABI/testing/sysfs-driver-uacce | 18 ++++++++++++++++++
> > >  1 file changed, 18 insertions(+)
> > > 
> > > diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
> > > index 08f2591138af..a8056271a963 100644
> > > --- a/Documentation/ABI/testing/sysfs-driver-uacce
> > > +++ b/Documentation/ABI/testing/sysfs-driver-uacce
> > > @@ -19,6 +19,24 @@ Contact:        linux-accelerators@lists.ozlabs.org
> > >  Description:    Available instances left of the device
> > >                  Return -ENODEV if uacce_ops get_available_instances is not provided
> > > 
> > > +What:           /sys/class/uacce/<dev_name>/isolate_strategy
> > > +Date:           Jul 2022
> > > +KernelVersion:  5.20
> > > +Contact:        linux-accelerators@lists.ozlabs.org
> > > +Description:    A sysfs node that used to configures the hardware error
> > 
> > This is not a "node" it is just a file.
> > 
> > 
> > > +                isolation strategy. This strategy is a configured integer value.
> > > +                The default is 0. The maximum value is 65535. This value
> > > +                indicates the number of device slot resets per unit time
> > > +                that your service can tolerate.
> > 
> > I do not understand this, sorry.  What do you mean by "that your service
> > can tolerate"?
> 
> it means the user can tolerable reset frequency, because the reset will
> interrupt services.

I am sorry, I still do not understand.  Please try explaining this in
more detail in the description.

> > > +
> > > +What:           /sys/class/uacce/<dev_name>/isolate
> > > +Date:           Jul 2022
> > > +KernelVersion:  5.20
> > > +Contact:        linux-accelerators@lists.ozlabs.org
> > > +Description:    A sysfs node that read the device isolated state. The value 0
> > > +                means that the device is working. The value 1 means that the
> > > +                device has been isolated.
> > 
> > So 1 means "not working"?  This seems odd, perhaps you can rephrase this
> > a bit better?
> 
> 1 means the device is unavailable. 0 means the device is available.

Then please say that :)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy
  2022-07-08  7:35   ` Greg KH
@ 2022-07-21  8:14     ` yekai(A)
  2022-07-23  7:21       ` yekai(A)
  0 siblings, 1 reply; 14+ messages in thread
From: yekai(A) @ 2022-07-21  8:14 UTC (permalink / raw)
  To: Greg KH; +Cc: herbert, linux-crypto, linux-kernel, wangzhou1



On 2022/7/8 15:35, Greg KH wrote:
> On Fri, Jul 08, 2022 at 03:08:20PM +0800, Kai Ye wrote:
>> Define the device isolation strategy by the device driver. The
>> user configures a frequency value by uacce interface. If the
>> slot reset frequency exceeds the value of setting for a certain
>> period of time, the device will not be available in user space.
>> The time window is one hour. The VF device use the PF device
>> isolation strategy. All the hardware errors are processed by PF
>> driver. This solution can be used for other drivers.
>>
>> Signed-off-by: Kai Ye <yekai13@huawei.com>
>> ---
>>  drivers/crypto/hisilicon/qm.c | 163 +++++++++++++++++++++++++++++++---
>>  include/linux/hisi_acc_qm.h   |   9 ++
>>  2 files changed, 160 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
>> index ad83c194d664..8eb3b790a655 100644
>> --- a/drivers/crypto/hisilicon/qm.c
>> +++ b/drivers/crypto/hisilicon/qm.c
>> @@ -417,6 +417,16 @@ struct hisi_qm_resource {
>>  	struct list_head list;
>>  };
>>
>> +/**
>> + * struct qm_hw_err - Structure describing the device errors
>> + * @list: hardware error list
>> + * @timestamp: timestamp when the error occurred
>> + */
>> +struct qm_hw_err {
>> +	struct list_head list;
>> +	unsigned long long timestamp;
>> +};
>> +
>>  struct hisi_qm_hw_ops {
>>  	int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number);
>>  	void (*qm_db)(struct hisi_qm *qm, u16 qn,
>> @@ -3410,6 +3420,111 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd,
>>  	return 0;
>>  }
>>
>> +/**
>> + * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs
>> + * according to user's configuration of isolation strategy. Warning: this
>> + * API should be called while there the users on this device are suspended
>> + * by slot resetting preparation of PCI AER.
>> + * @qm: the uacce device
>> + */
>> +static int qm_hw_err_isolate(struct hisi_qm *qm)
>> +{
>> +	struct qm_hw_err *err, *tmp, *hw_err;
>> +	struct qm_err_isolate *isolate;
>> +	u32 count = 0;
>> +
>> +	isolate = &qm->isolate_data;
>> +
>> +#define SECONDS_PER_HOUR	3600
>> +
>> +	/* All the hw errs are processed by PF driver */
>> +	if (qm->uacce->is_vf || isolate->is_isolate ||
>> +	    !isolate->hw_err_isolate_hz)
>> +		return 0;
>> +
>> +	hw_err = kzalloc(sizeof(*hw_err), GFP_ATOMIC);
>
> Why atomic?  What lock is held here?

Atomic is not required, so I will use GFP_KERNEL instead.
>
>> +	if (!hw_err)
>> +		return -ENOMEM;
>> +
>> +	mutex_lock(&isolate->isolate_lock);
>> +	hw_err->timestamp = jiffies;
>> +	list_for_each_entry_safe(err, tmp, &isolate->uacce_hw_errs, list) {
>> +		if ((hw_err->timestamp - err->timestamp) / HZ >
>> +		    SECONDS_PER_HOUR) {
>
> No possiblity of wrapping the timestamp?
I do not understand this suggestion. Could you explain it in more
detail?

>
>> +			list_del(&err->list);
>> +			kfree(err);
>> +		} else {
>> +			count++;
>> +		}
>> +	}
>> +	list_add(&hw_err->list, &isolate->uacce_hw_errs);
>> +	mutex_unlock(&isolate->isolate_lock);
>> +
>> +	if (count >= isolate->hw_err_isolate_hz)
>> +		isolate->is_isolate = true;
>> +
>> +	return 0;
>> +}
>> +
>> +static void qm_hw_err_destroy(struct hisi_qm *qm)
>> +{
>> +	struct qm_hw_err *err, *tmp;
>> +
>> +	mutex_lock(&qm->isolate_data.isolate_lock);
>> +	list_for_each_entry_safe(err, tmp, &qm->isolate_data.uacce_hw_errs, list) {
>> +		list_del(&err->list);
>> +		kfree(err);
>> +	}
>> +	mutex_unlock(&qm->isolate_data.isolate_lock);
>> +}
>> +
>> +static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce)
>> +{
>> +	struct hisi_qm *qm = uacce->priv;
>> +	struct hisi_qm *pf_qm;
>> +
>> +	if (uacce->is_vf)
>> +		pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
>> +	else
>> +		pf_qm = qm;
>> +
>> +	return pf_qm->isolate_data.is_isolate ?
>> +			UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL;
>> +}
>> +
>> +static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce,
>> +					  u32 freq)
>> +{
>> +	struct hisi_qm *qm = uacce->priv;
>> +
>> +	/* Must be set by PF */
>> +	if (uacce->is_vf)
>> +		return -EINVAL;
>
> But the value passed to you is not invalid, something else went wrong.
> Are you sure this is the correct error?
I will use EPERM instead of EINVAL.
>
>> +
>> +	if (qm->isolate_data.is_isolate)
>> +		return -EINVAL;
>
> Same here, why is this correct?
I will use EPERM instead of EINVAL here as well.
>
>> +
>> +	qm->isolate_data.hw_err_isolate_hz = freq;
>
> No validation of the value passed to you?  It can be anything?
>
>> +
>> +	/* After the policy is updated, need to reset the hardware err list */
>> +	qm_hw_err_destroy(qm);
>
> No error checking?
This function only clears the list, so no error checking is required.
>
> thanks,
>
> greg k-h
> .
>

Thanks

Kai

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy
  2022-07-21  8:14     ` yekai(A)
@ 2022-07-23  7:21       ` yekai(A)
  0 siblings, 0 replies; 14+ messages in thread
From: yekai(A) @ 2022-07-23  7:21 UTC (permalink / raw)
  To: Greg KH; +Cc: herbert, linux-crypto, linux-kernel, wangzhou1



On 2022/7/21 16:14, yekai(A) wrote:
>
>
> On 2022/7/8 15:35, Greg KH wrote:
>> On Fri, Jul 08, 2022 at 03:08:20PM +0800, Kai Ye wrote:
>>> Define the device isolation strategy by the device driver. The
>>> user configures a frequency value by uacce interface. If the
>>> slot reset frequency exceeds the value of setting for a certain
>>> period of time, the device will not be available in user space.
>>> The time window is one hour. The VF device use the PF device
>>> isolation strategy. All the hardware errors are processed by PF
>>> driver. This solution can be used for other drivers.
>>>
>>> Signed-off-by: Kai Ye <yekai13@huawei.com>
>>> ---
>>>  drivers/crypto/hisilicon/qm.c | 163 +++++++++++++++++++++++++++++++---
>>>  include/linux/hisi_acc_qm.h   |   9 ++
>>>  2 files changed, 160 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/crypto/hisilicon/qm.c
>>> b/drivers/crypto/hisilicon/qm.c
>>> index ad83c194d664..8eb3b790a655 100644
>>> --- a/drivers/crypto/hisilicon/qm.c
>>> +++ b/drivers/crypto/hisilicon/qm.c
>>> @@ -417,6 +417,16 @@ struct hisi_qm_resource {
>>>      struct list_head list;
>>>  };
>>>
>>> +/**
>>> + * struct qm_hw_err - Structure describing the device errors
>>> + * @list: hardware error list
>>> + * @timestamp: timestamp when the error occurred
>>> + */
>>> +struct qm_hw_err {
>>> +    struct list_head list;
>>> +    unsigned long long timestamp;
>>> +};
>>> +
>>>  struct hisi_qm_hw_ops {
>>>      int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number);
>>>      void (*qm_db)(struct hisi_qm *qm, u16 qn,
>>> @@ -3410,6 +3420,111 @@ static long hisi_qm_uacce_ioctl(struct
>>> uacce_queue *q, unsigned int cmd,
>>>      return 0;
>>>  }
>>>
>>> +/**
>>> + * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs
>>> + * according to user's configuration of isolation strategy. Warning:
>>> this
>>> + * API should be called while there the users on this device are
>>> suspended
>>> + * by slot resetting preparation of PCI AER.
>>> + * @qm: the uacce device
>>> + */
>>> +static int qm_hw_err_isolate(struct hisi_qm *qm)
>>> +{
>>> +    struct qm_hw_err *err, *tmp, *hw_err;
>>> +    struct qm_err_isolate *isolate;
>>> +    u32 count = 0;
>>> +
>>> +    isolate = &qm->isolate_data;
>>> +
>>> +#define SECONDS_PER_HOUR    3600
>>> +
>>> +    /* All the hw errs are processed by PF driver */
>>> +    if (qm->uacce->is_vf || isolate->is_isolate ||
>>> +        !isolate->hw_err_isolate_hz)
>>> +        return 0;
>>> +
>>> +    hw_err = kzalloc(sizeof(*hw_err), GFP_ATOMIC);
>>
>> Why atomic?  What lock is held here?
>
> Atomic is not required. So use GFP_KERNEL.
>>
>>> +    if (!hw_err)
>>> +        return -ENOMEM;
>>> +
>>> +    mutex_lock(&isolate->isolate_lock);
>>> +    hw_err->timestamp = jiffies;
>>> +    list_for_each_entry_safe(err, tmp, &isolate->uacce_hw_errs, list) {
>>> +        if ((hw_err->timestamp - err->timestamp) / HZ >
>>> +            SECONDS_PER_HOUR) {
>>
>> No possiblity of wrapping the timestamp?
> I do not understand this suggestion, Can you show more detail in this
> suggestion?
>
>>
>>> +            list_del(&err->list);
>>> +            kfree(err);
>>> +        } else {
>>> +            count++;
>>> +        }
>>> +    }
>>> +    list_add(&hw_err->list, &isolate->uacce_hw_errs);
>>> +    mutex_unlock(&isolate->isolate_lock);
>>> +
>>> +    if (count >= isolate->hw_err_isolate_hz)
>>> +        isolate->is_isolate = true;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void qm_hw_err_destroy(struct hisi_qm *qm)
>>> +{
>>> +    struct qm_hw_err *err, *tmp;
>>> +
>>> +    mutex_lock(&qm->isolate_data.isolate_lock);
>>> +    list_for_each_entry_safe(err, tmp,
>>> &qm->isolate_data.uacce_hw_errs, list) {
>>> +        list_del(&err->list);
>>> +        kfree(err);
>>> +    }
>>> +    mutex_unlock(&qm->isolate_data.isolate_lock);
>>> +}
>>> +
>>> +static enum uacce_dev_state hisi_qm_get_isolate_state(struct
>>> uacce_device *uacce)
>>> +{
>>> +    struct hisi_qm *qm = uacce->priv;
>>> +    struct hisi_qm *pf_qm;
>>> +
>>> +    if (uacce->is_vf)
>>> +        pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
>>> +    else
>>> +        pf_qm = qm;
>>> +
>>> +    return pf_qm->isolate_data.is_isolate ?
>>> +            UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL;
>>> +}
>>> +
>>> +static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce,
>>> +                      u32 freq)
>>> +{
>>> +    struct hisi_qm *qm = uacce->priv;
>>> +
>>> +    /* Must be set by PF */
>>> +    if (uacce->is_vf)
>>> +        return -EINVAL;
>>
>> But the value passed to you is not invalid, something else went wrong.
>> Are you sure this is the correct error?
> use EPERM instead of EINVAL.
>>
>>> +
>>> +    if (qm->isolate_data.is_isolate)
>>> +        return -EINVAL;
>>
>> Same here, why is this correct?
> use EPERM instead of EINVAL.
>>
>>> +
>>> +    qm->isolate_data.hw_err_isolate_hz = freq;
>>
>> No validation of the value passed to you?  It can be anything?
The range has already been verified by UACCE, so there is no need to check it again.

>>
>>> +
>>> +    /* After the policy is updated, need to reset the hardware err
>>> list */
>>> +    qm_hw_err_destroy(qm);
>>
>> No error checking?
> Due to the process is clean list. So no error checking is required.
>>
>> thanks,
>>
>> greg k-h
>> .
>>
>
> Thanks
>
> Kai
> .
>

Thanks

Kai

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-07-23  7:23 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-08  7:08 [PATCH v5 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
2022-07-08  7:08 ` [PATCH v5 1/3] uacce: " Kai Ye
2022-07-08  7:28   ` Greg KH
2022-07-08  9:33     ` yekai(A)
2022-07-08 10:01       ` Greg KH
2022-07-08  7:08 ` [PATCH v5 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
2022-07-08  7:30   ` Greg KH
2022-07-08  9:38     ` yekai(A)
2022-07-08 10:02       ` Greg KH
2022-07-08  7:30   ` Greg KH
2022-07-08  7:08 ` [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy Kai Ye
2022-07-08  7:35   ` Greg KH
2022-07-21  8:14     ` yekai(A)
2022-07-23  7:21       ` yekai(A)

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.