linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v9 0/3] crypto: hisilicon - supports device isolation feature
@ 2022-10-25 12:39 Kai Ye
  2022-10-25 12:39 ` [PATCH v9 1/3] uacce: " Kai Ye
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Kai Ye @ 2022-10-25 12:39 UTC (permalink / raw)
  To: gregkh, linux-accelerators, linux-kernel, linuxarm, zhangfei.gao,
	wangzhou1, yekai13

1.Add the uacce hardware error isolation interface. Supports
  configures the hardware error isolation frequency.
2.Defining the isolation strategy for ACC by uacce sysfs node. If the
  number of hardware errors in a per hour exceeds the configured value,
  the device will not be available in user space. The VF device use the
  PF device isolation strategy.

changes v1->v2:
        - deleted dev_to_uacce api.
        - add vfs node doc.
        - move uacce->ref to driver.
changes v2->v3:
        - deleted some redundant code.
        - use qm state instead of reference count.
        - add null pointer check.
        - isolate_strategy_read() instead of a copy.
changes v3->v4:
        - modify a comment
changes v4->v5:
        - use bool instead of atomic.
        - isolation frequency instead of isolation command.
changes v5->v6:
        - add is_visible in uacce.
        - add the description of the isolation strategy file node.
changes v6->v7
        - add an example for isolate_strategy in Documentation.
changes v7->v8
        - update the correct date.
changes v8->v9
        - move isolation strategy from qm to uacce.

Kai Ye (3):
  uacce: supports device isolation feature
  Documentation: add a isolation strategy sysfs node for uacce
  crypto: hisilicon/qm - add the device isolation feature for acc

 Documentation/ABI/testing/sysfs-driver-uacce |  27 ++++
 drivers/crypto/hisilicon/qm.c                |  66 +++++++--
 drivers/misc/uacce/uacce.c                   | 145 +++++++++++++++++++
 include/linux/uacce.h                        |  43 +++++-
 4 files changed, 271 insertions(+), 10 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v9 1/3] uacce: supports device isolation feature
  2022-10-25 12:39 [PATCH v9 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
@ 2022-10-25 12:39 ` Kai Ye
  2022-10-25 13:00   ` Greg KH
  2022-10-26  6:14   ` kernel test robot
  2022-10-25 12:39 ` [PATCH v9 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
  2022-10-25 12:39 ` [PATCH v9 3/3] crypto: hisilicon/qm - add the device isolation feature for acc Kai Ye
  2 siblings, 2 replies; 7+ messages in thread
From: Kai Ye @ 2022-10-25 12:39 UTC (permalink / raw)
  To: gregkh, linux-accelerators, linux-kernel, linuxarm, zhangfei.gao,
	wangzhou1, yekai13

UACCE adds the hardware error isolation API. Users can configure
the isolation frequency by this sysfs node. UACCE reports the device
isolate state to the user space. If the AER error frequency exceeds
the set value in one hour, the device will be isolated.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 drivers/misc/uacce/uacce.c | 145 +++++++++++++++++++++++++++++++++++++
 include/linux/uacce.h      |  43 ++++++++++-
 2 files changed, 187 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
index b70a013139c7..f293fcdcf44f 100644
--- a/drivers/misc/uacce/uacce.c
+++ b/drivers/misc/uacce/uacce.c
@@ -7,10 +7,100 @@
 #include <linux/slab.h>
 #include <linux/uacce.h>
 
+#define MAX_ERR_ISOLATE_COUNT	65535
+
 static struct class *uacce_class;
 static dev_t uacce_devt;
 static DEFINE_XARRAY_ALLOC(uacce_xa);
 
+static int cdev_get(struct device *dev, void *data)
+{
+	struct uacce_device *uacce;
+	struct device **t_dev = data;
+
+	uacce = container_of(dev, struct uacce_device, dev);
+	if (uacce->parent == *t_dev) {
+		*t_dev = dev;
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * dev_to_uacce - Get structure uacce device from its parent device
+ * @dev the device
+ */
+struct uacce_device *dev_to_uacce(struct device *dev)
+{
+	struct device **tdev = &dev;
+	int ret;
+
+	ret = class_for_each_device(uacce_class, NULL, tdev, cdev_get);
+	if (ret) {
+		dev = *tdev;
+		return container_of(dev, struct uacce_device, dev);
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(dev_to_uacce);
+
+/**
+ * uacce_hw_err_isolate - Try to set the isolation status of the uacce device
+ * according to user's configuration of isolation strategy.
+ * @uacce the uacce device
+ */
+int uacce_hw_err_isolate(struct uacce_device *uacce)
+{
+	struct uacce_hw_err *err, *tmp, *hw_err;
+	struct uacce_err_isolate *isolate_ctx;
+	u32 count = 0;
+
+	if (!uacce)
+		return -EINVAL;
+
+	isolate_ctx = uacce->isolate_ctx;
+
+#define SECONDS_PER_HOUR	3600
+
+	/* All the hw errs are processed by PF driver */
+	if (uacce->is_vf || isolate_ctx->is_isolate ||
+		!isolate_ctx->hw_err_isolate_hz)
+		return 0;
+
+	hw_err = kzalloc(sizeof(*hw_err), GFP_KERNEL);
+	if (!hw_err)
+		return -ENOMEM;
+
+	hw_err->timestamp = jiffies;
+	list_for_each_entry_safe(err, tmp, &isolate_ctx->hw_errs, list) {
+		if ((hw_err->timestamp - err->timestamp) / HZ >
+		    SECONDS_PER_HOUR) {
+			list_del(&err->list);
+			kfree(err);
+		} else {
+			count++;
+		}
+	}
+	list_add(&hw_err->list, &isolate_ctx->hw_errs);
+
+	if (count >= isolate_ctx->hw_err_isolate_hz)
+		isolate_ctx->is_isolate = true;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(uacce_hw_err_isolate);
+
+static void uacce_hw_err_destroy(struct uacce_device *uacce)
+{
+	struct uacce_hw_err *err, *tmp;
+
+	list_for_each_entry_safe(err, tmp, &uacce->isolate_data.hw_errs, list) {
+		list_del(&err->list);
+		kfree(err);
+	}
+}
+
 /*
  * If the parent driver or the device disappears, the queue state is invalid and
  * ops are not usable anymore.
@@ -363,12 +453,59 @@ static ssize_t region_dus_size_show(struct device *dev,
 		       uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT);
 }
 
+static ssize_t isolate_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+	int ret = UACCE_DEV_NORMAL;
+
+	if (uacce->isolate_ctx->is_isolate)
+		ret = UACCE_DEV_ISOLATE;
+
+	return sysfs_emit(buf, "%d\n", ret);
+}
+
+static ssize_t isolate_strategy_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	return sysfs_emit(buf, "%u\n", uacce->isolate_ctx->hw_err_isolate_hz);
+}
+
+static ssize_t isolate_strategy_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+	unsigned long val;
+
+	/* must be set by PF */
+	if (uacce->is_vf)
+		return -EPERM;
+
+	if (kstrtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val > MAX_ERR_ISOLATE_COUNT)
+		return -EINVAL;
+
+	uacce->isolate_ctx->hw_err_isolate_hz = val;
+
+	/* After the policy is updated, need to reset the hardware err list */
+	uacce_hw_err_destroy(uacce);
+
+	return count;
+}
+
 static DEVICE_ATTR_RO(api);
 static DEVICE_ATTR_RO(flags);
 static DEVICE_ATTR_RO(available_instances);
 static DEVICE_ATTR_RO(algorithms);
 static DEVICE_ATTR_RO(region_mmio_size);
 static DEVICE_ATTR_RO(region_dus_size);
+static DEVICE_ATTR_RO(isolate);
+static DEVICE_ATTR_RW(isolate_strategy);
 
 static struct attribute *uacce_dev_attrs[] = {
 	&dev_attr_api.attr,
@@ -377,6 +514,8 @@ static struct attribute *uacce_dev_attrs[] = {
 	&dev_attr_algorithms.attr,
 	&dev_attr_region_mmio_size.attr,
 	&dev_attr_region_dus_size.attr,
+	&dev_attr_isolate.attr,
+	&dev_attr_isolate_strategy.attr,
 	NULL,
 };
 
@@ -392,6 +531,9 @@ static umode_t uacce_dev_is_visible(struct kobject *kobj,
 	    (!uacce->qf_pg_num[UACCE_QFRT_DUS])))
 		return 0;
 
+	if (attr == &dev_attr_isolate_strategy.attr && !uacce->isolate_ctx)
+		return 0;
+
 	return attr->mode;
 }
 
@@ -474,6 +616,7 @@ struct uacce_device *uacce_alloc(struct device *parent,
 		goto err_with_uacce;
 
 	INIT_LIST_HEAD(&uacce->queues);
+	INIT_LIST_HEAD(&uacce->isolate_data.hw_errs);
 	mutex_init(&uacce->mutex);
 	device_initialize(&uacce->dev);
 	uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id);
@@ -555,6 +698,8 @@ void uacce_remove(struct uacce_device *uacce)
 	if (uacce->cdev)
 		cdev_device_del(uacce->cdev, &uacce->dev);
 	xa_erase(&uacce_xa, uacce->dev_id);
+
+	uacce_hw_err_destroy(uacce);
 	/*
 	 * uacce exists as long as there are open fds, but ops will be freed
 	 * now. Ensure that bugs cause NULL deref rather than use-after-free.
diff --git a/include/linux/uacce.h b/include/linux/uacce.h
index 9ce88c28b0a8..c8eecaf7b16d 100644
--- a/include/linux/uacce.h
+++ b/include/linux/uacce.h
@@ -12,6 +12,28 @@
 struct uacce_queue;
 struct uacce_device;
 
+/**
+ * struct uacce_hw_err - Structure describing the device errors
+ * @list: hardware error log node
+ * @timestamp: timestamp when the error occurred
+ */
+struct uacce_hw_err {
+	struct list_head list;
+	unsigned long long timestamp;
+};
+
+/**
+ * struct uacce_err_isolate - Structure describing the isolation data
+ * @hw_err_isolate_hz: user cfg freq which triggers isolation
+ * @is_isolate: device isolate state
+ * @hw_errs: uacce hardware error list
+ */
+struct uacce_err_isolate {
+	u32 hw_err_isolate_hz;
+	bool is_isolate;
+	struct list_head hw_errs;
+};
+
 /**
  * struct uacce_qfile_region - structure of queue file region
  * @type: type of the region
@@ -57,6 +79,11 @@ struct uacce_interface {
 	const struct uacce_ops *ops;
 };
 
+enum uacce_dev_state {
+	UACCE_DEV_NORMAL,
+	UACCE_DEV_ISOLATE,
+};
+
 enum uacce_q_state {
 	UACCE_Q_ZOMBIE = 0,
 	UACCE_Q_INIT,
@@ -101,6 +128,8 @@ struct uacce_queue {
  * @dev: dev of the uacce
  * @mutex: protects uacce operation
  * @priv: private pointer of the uacce
+ * @isolate_data: device isolation data about pf and vf device
+ * @isolate_ctx: isolation ctx about current char device
  * @queues: list of queues
  * @inode: core vfs
  */
@@ -117,6 +146,8 @@ struct uacce_device {
 	struct device dev;
 	struct mutex mutex;
 	void *priv;
+	struct uacce_err_isolate isolate_data;
+	struct uacce_err_isolate *isolate_ctx;
 	struct list_head queues;
 	struct inode *inode;
 };
@@ -127,7 +158,8 @@ struct uacce_device *uacce_alloc(struct device *parent,
 				 struct uacce_interface *interface);
 int uacce_register(struct uacce_device *uacce);
 void uacce_remove(struct uacce_device *uacce);
-
+struct uacce_device *dev_to_uacce(struct device *dev);
+int uacce_hw_err_isolate(struct uacce_device *uacce);
 #else /* CONFIG_UACCE */
 
 static inline
@@ -144,6 +176,15 @@ static inline int uacce_register(struct uacce_device *uacce)
 
 static inline void uacce_remove(struct uacce_device *uacce) {}
 
+static inline struct uacce_device *dev_to_uacce(struct device *dev)
+{
+	return NULL;
+}
+
+int uacce_hw_err_isolate(struct uacce_device *uacce)
+{
+	return -EINVAL;
+}
 #endif /* CONFIG_UACCE */
 
 #endif /* _LINUX_UACCE_H */
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v9 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-10-25 12:39 [PATCH v9 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
  2022-10-25 12:39 ` [PATCH v9 1/3] uacce: " Kai Ye
@ 2022-10-25 12:39 ` Kai Ye
  2022-10-25 13:03   ` Greg KH
  2022-10-25 12:39 ` [PATCH v9 3/3] crypto: hisilicon/qm - add the device isolation feature for acc Kai Ye
  2 siblings, 1 reply; 7+ messages in thread
From: Kai Ye @ 2022-10-25 12:39 UTC (permalink / raw)
  To: gregkh, linux-accelerators, linux-kernel, linuxarm, zhangfei.gao,
	wangzhou1, yekai13

Update documentation describing sysfs node that could help to
configure isolation strategy for users in the user space. And
describing sysfs node that could read the device isolated state.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 Documentation/ABI/testing/sysfs-driver-uacce | 27 ++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
index 08f2591138af..50737c897ba3 100644
--- a/Documentation/ABI/testing/sysfs-driver-uacce
+++ b/Documentation/ABI/testing/sysfs-driver-uacce
@@ -19,6 +19,33 @@ Contact:        linux-accelerators@lists.ozlabs.org
 Description:    Available instances left of the device
                 Return -ENODEV if uacce_ops get_available_instances is not provided
 
+What:           /sys/class/uacce/<dev_name>/isolate_strategy
+Date:           Oct 2022
+KernelVersion:  6.1
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    (RW) Configure the frequency size for the hardware error
+                isolation strategy. This unit is the number of times. Number
+                of occurrences in a period, also means threshold. If the number
+                of device pci AER error exceeds the threshold in a time window,
+                the device is isolated. This size is a configured integer value.
+                The default is 0. The maximum value is 65535.
+
+                In the hisilicon accelerator engine, first we will
+                time-stamp every slot AER error. Then check the AER error log
+                when the device AER error occurred. if the device slot AER error
+                count exceeds the preset the number of times in one hour, the
+                isolated state will be set to true. So the device will be
+                isolated. And the AER error log that exceed one hour will be
+                cleared.
+
+What:           /sys/class/uacce/<dev_name>/isolate
+Date:           Oct 2022
+KernelVersion:  6.1
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    (R) A sysfs node that read the device isolated state. The value 1
+                means the device is unavailable. The 0 means the device is
+                available.
+
 What:           /sys/class/uacce/<dev_name>/algorithms
 Date:           Feb 2020
 KernelVersion:  5.7
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v9 3/3] crypto: hisilicon/qm - add the device isolation feature for acc
  2022-10-25 12:39 [PATCH v9 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
  2022-10-25 12:39 ` [PATCH v9 1/3] uacce: " Kai Ye
  2022-10-25 12:39 ` [PATCH v9 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
@ 2022-10-25 12:39 ` Kai Ye
  2 siblings, 0 replies; 7+ messages in thread
From: Kai Ye @ 2022-10-25 12:39 UTC (permalink / raw)
  To: gregkh, linux-accelerators, linux-kernel, linuxarm, zhangfei.gao,
	wangzhou1, yekai13

Record every AER error by uacce api. And isolate the device directly
when the controller reset fail. The VF device use the PF device
isolation strategy. Once the PF device is isolated, its VF device will
also be isolated.

Signed-off-by: Kai Ye <yekai13@huawei.com>
---
 drivers/crypto/hisilicon/qm.c | 66 ++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 363a02810a16..aa953ce86f70 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3397,6 +3397,29 @@ static void qm_set_sqctype(struct uacce_queue *q, u16 type)
 	up_write(&qm->qps_lock);
 }
 
+static int qm_uacce_isolate_init(struct hisi_qm *qm)
+{
+	struct pci_dev *pdev = qm->pdev;
+	struct uacce_device *pf_uacce, *uacce;
+	struct device *pf_dev = &(pci_physfn(pdev)->dev);
+
+	uacce = qm->uacce;
+	if (uacce->is_vf) {
+		/* VF uses PF's isoalte data */
+		pf_uacce = dev_to_uacce(pf_dev);
+		if (!pf_uacce) {
+			pci_err(pdev, "fail to PF device!\n");
+			return -ENODEV;
+		}
+
+		uacce->isolate_ctx = &pf_uacce->isolate_data;
+	} else {
+		uacce->isolate_ctx = &uacce->isolate_data;
+	}
+
+	return 0;
+}
+
 static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd,
 				unsigned long arg)
 {
@@ -3450,6 +3473,14 @@ static const struct uacce_ops uacce_qm_ops = {
 	.is_q_updated = hisi_qm_is_q_updated,
 };
 
+static void qm_remove_uacce(struct hisi_qm *qm)
+{
+	if (qm->use_sva) {
+		uacce_remove(qm->uacce);
+		qm->uacce = NULL;
+	}
+}
+
 static int qm_alloc_uacce(struct hisi_qm *qm)
 {
 	struct pci_dev *pdev = qm->pdev;
@@ -3511,7 +3542,14 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
 
 	qm->uacce = uacce;
 
+	ret = qm_uacce_isolate_init(qm);
+	if (ret)
+		goto err_rm_uacce;
+
 	return 0;
+err_rm_uacce:
+	qm_remove_uacce(qm);
+	return ret;
 }
 
 /**
@@ -5133,6 +5171,12 @@ static int qm_controller_reset_prepare(struct hisi_qm *qm)
 		return ret;
 	}
 
+	if (qm->use_sva) {
+		ret = uacce_hw_err_isolate(qm->uacce);
+		if (ret)
+			pci_err(pdev, "failed to isolate hw err!\n");
+	}
+
 	ret = qm_wait_vf_prepare_finish(qm);
 	if (ret)
 		pci_err(pdev, "failed to stop by vfs in soft reset!\n");
@@ -5458,21 +5502,25 @@ static int qm_controller_reset(struct hisi_qm *qm)
 		qm->err_ini->show_last_dfx_regs(qm);
 
 	ret = qm_soft_reset(qm);
-	if (ret) {
-		pci_err(pdev, "Controller reset failed (%d)\n", ret);
-		qm_reset_bit_clear(qm);
-		return ret;
-	}
+	if (ret)
+		goto err_reset;
 
 	ret = qm_controller_reset_done(qm);
-	if (ret) {
-		qm_reset_bit_clear(qm);
-		return ret;
-	}
+	if (ret)
+		goto err_reset;
 
 	pci_info(pdev, "Controller reset complete\n");
 
 	return 0;
+err_reset:
+	pci_info(pdev, "Controller reset failed (%d)\n", ret);
+	qm_reset_bit_clear(qm);
+
+	/* if resetting fails, isolate the device */
+	if (qm->use_sva && !qm->uacce->is_vf)
+		qm->uacce->isolate_ctx->is_isolate = true;
+
+	return ret;
 }
 
 /**
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v9 1/3] uacce: supports device isolation feature
  2022-10-25 12:39 ` [PATCH v9 1/3] uacce: " Kai Ye
@ 2022-10-25 13:00   ` Greg KH
  2022-10-26  6:14   ` kernel test robot
  1 sibling, 0 replies; 7+ messages in thread
From: Greg KH @ 2022-10-25 13:00 UTC (permalink / raw)
  To: Kai Ye
  Cc: linux-accelerators, linux-kernel, linuxarm, zhangfei.gao, wangzhou1

On Tue, Oct 25, 2022 at 12:39:29PM +0000, Kai Ye wrote:
> UACCE adds the hardware error isolation API. Users can configure
> the isolation frequency by this sysfs node. UACCE reports the device
> isolate state to the user space. If the AER error frequency exceeds
> the set value in one hour, the device will be isolated.
> 

You are trying to "reach across" to different types of devices here,
why?  That feels backwards.  Why isn't the device that needs to use this
api just create a child class device of this type?



> Signed-off-by: Kai Ye <yekai13@huawei.com>
> ---
>  drivers/misc/uacce/uacce.c | 145 +++++++++++++++++++++++++++++++++++++
>  include/linux/uacce.h      |  43 ++++++++++-
>  2 files changed, 187 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
> index b70a013139c7..f293fcdcf44f 100644
> --- a/drivers/misc/uacce/uacce.c
> +++ b/drivers/misc/uacce/uacce.c
> @@ -7,10 +7,100 @@
>  #include <linux/slab.h>
>  #include <linux/uacce.h>
>  
> +#define MAX_ERR_ISOLATE_COUNT	65535
> +
>  static struct class *uacce_class;
>  static dev_t uacce_devt;
>  static DEFINE_XARRAY_ALLOC(uacce_xa);
>  
> +static int cdev_get(struct device *dev, void *data)

That's a very odd function, considering it does not "get" anything.

And it does not work on cdev structures.

> +{
> +	struct uacce_device *uacce;
> +	struct device **t_dev = data;

Why are you having to cast this?  Why not make it a real pointer?

> +
> +	uacce = container_of(dev, struct uacce_device, dev);
> +	if (uacce->parent == *t_dev) {
> +		*t_dev = dev;
> +		return 1;

bool?

And what happened to the reference count here?

> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * dev_to_uacce - Get structure uacce device from its parent device
> + * @dev the device
> + */
> +struct uacce_device *dev_to_uacce(struct device *dev)
> +{
> +	struct device **tdev = &dev;
> +	int ret;
> +
> +	ret = class_for_each_device(uacce_class, NULL, tdev, cdev_get);

Ah, you use it here.

No, sorry, you can not just walk all devices in the tree and assume this
will work.

Why do you not already know the device already?

> +	if (ret) {
> +		dev = *tdev;
> +		return container_of(dev, struct uacce_device, dev);
> +	}
> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(dev_to_uacce);

Where is the reference counting here?

And again, horrible global function name :(

> +
> +/**
> + * uacce_hw_err_isolate - Try to set the isolation status of the uacce device
> + * according to user's configuration of isolation strategy.
> + * @uacce the uacce device
> + */
> +int uacce_hw_err_isolate(struct uacce_device *uacce)
> +{
> +	struct uacce_hw_err *err, *tmp, *hw_err;
> +	struct uacce_err_isolate *isolate_ctx;
> +	u32 count = 0;
> +
> +	if (!uacce)
> +		return -EINVAL;

How can this happen?

> +
> +	isolate_ctx = uacce->isolate_ctx;
> +
> +#define SECONDS_PER_HOUR	3600

We don't already have this in a header file?

> +
> +	/* All the hw errs are processed by PF driver */
> +	if (uacce->is_vf || isolate_ctx->is_isolate ||
> +		!isolate_ctx->hw_err_isolate_hz)
> +		return 0;
> +
> +	hw_err = kzalloc(sizeof(*hw_err), GFP_KERNEL);
> +	if (!hw_err)
> +		return -ENOMEM;
> +
> +	hw_err->timestamp = jiffies;
> +	list_for_each_entry_safe(err, tmp, &isolate_ctx->hw_errs, list) {
> +		if ((hw_err->timestamp - err->timestamp) / HZ >
> +		    SECONDS_PER_HOUR) {
> +			list_del(&err->list);
> +			kfree(err);
> +		} else {
> +			count++;
> +		}
> +	}
> +	list_add(&hw_err->list, &isolate_ctx->hw_errs);
> +
> +	if (count >= isolate_ctx->hw_err_isolate_hz)
> +		isolate_ctx->is_isolate = true;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(uacce_hw_err_isolate);
> +
> +static void uacce_hw_err_destroy(struct uacce_device *uacce)
> +{
> +	struct uacce_hw_err *err, *tmp;
> +
> +	list_for_each_entry_safe(err, tmp, &uacce->isolate_data.hw_errs, list) {
> +		list_del(&err->list);
> +		kfree(err);

No reference counting at all?

> +	}
> +}
> +
>  /*
>   * If the parent driver or the device disappears, the queue state is invalid and
>   * ops are not usable anymore.
> @@ -363,12 +453,59 @@ static ssize_t region_dus_size_show(struct device *dev,
>  		       uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT);
>  }
>  
> +static ssize_t isolate_show(struct device *dev,
> +			    struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +	int ret = UACCE_DEV_NORMAL;
> +
> +	if (uacce->isolate_ctx->is_isolate)
> +		ret = UACCE_DEV_ISOLATE;
> +
> +	return sysfs_emit(buf, "%d\n", ret);
> +}
> +
> +static ssize_t isolate_strategy_show(struct device *dev,
> +				     struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	return sysfs_emit(buf, "%u\n", uacce->isolate_ctx->hw_err_isolate_hz);
> +}
> +
> +static ssize_t isolate_strategy_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t count)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +	unsigned long val;
> +
> +	/* must be set by PF */
> +	if (uacce->is_vf)
> +		return -EPERM;
> +
> +	if (kstrtoul(buf, 0, &val) < 0)
> +		return -EINVAL;
> +
> +	if (val > MAX_ERR_ISOLATE_COUNT)
> +		return -EINVAL;
> +
> +	uacce->isolate_ctx->hw_err_isolate_hz = val;
> +
> +	/* After the policy is updated, need to reset the hardware err list */
> +	uacce_hw_err_destroy(uacce);
> +
> +	return count;
> +}
> +
>  static DEVICE_ATTR_RO(api);
>  static DEVICE_ATTR_RO(flags);
>  static DEVICE_ATTR_RO(available_instances);
>  static DEVICE_ATTR_RO(algorithms);
>  static DEVICE_ATTR_RO(region_mmio_size);
>  static DEVICE_ATTR_RO(region_dus_size);
> +static DEVICE_ATTR_RO(isolate);
> +static DEVICE_ATTR_RW(isolate_strategy);

No documentation for these new sysfs files?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v9 2/3] Documentation: add a isolation strategy sysfs node for uacce
  2022-10-25 12:39 ` [PATCH v9 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
@ 2022-10-25 13:03   ` Greg KH
  0 siblings, 0 replies; 7+ messages in thread
From: Greg KH @ 2022-10-25 13:03 UTC (permalink / raw)
  To: Kai Ye
  Cc: linux-accelerators, linux-kernel, linuxarm, zhangfei.gao, wangzhou1

On Tue, Oct 25, 2022 at 12:39:30PM +0000, Kai Ye wrote:
> Update documentation describing sysfs node that could help to
> configure isolation strategy for users in the user space. And
> describing sysfs node that could read the device isolated state.
> 
> Signed-off-by: Kai Ye <yekai13@huawei.com>
> ---
>  Documentation/ABI/testing/sysfs-driver-uacce | 27 ++++++++++++++++++++
>  1 file changed, 27 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
> index 08f2591138af..50737c897ba3 100644
> --- a/Documentation/ABI/testing/sysfs-driver-uacce
> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
> @@ -19,6 +19,33 @@ Contact:        linux-accelerators@lists.ozlabs.org
>  Description:    Available instances left of the device
>                  Return -ENODEV if uacce_ops get_available_instances is not provided
>  
> +What:           /sys/class/uacce/<dev_name>/isolate_strategy
> +Date:           Oct 2022
> +KernelVersion:  6.1
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    (RW) Configure the frequency size for the hardware error
> +                isolation strategy. This unit is the number of times. Number

Number of times what?

> +                of occurrences in a period, also means threshold. If the number
> +                of device pci AER error exceeds the threshold in a time window,

What is the time window?

> +                the device is isolated. This size is a configured integer value.
> +                The default is 0. The maximum value is 65535.
> +
> +                In the hisilicon accelerator engine, first we will
> +                time-stamp every slot AER error. Then check the AER error log
> +                when the device AER error occurred. if the device slot AER error
> +                count exceeds the preset the number of times in one hour, the
> +                isolated state will be set to true. So the device will be
> +                isolated. And the AER error log that exceed one hour will be
> +                cleared.

This seems like a very hardware-specific implementation here.  And this
is supposed to be a generic class?

I feel this is getting really messy :(

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v9 1/3] uacce: supports device isolation feature
  2022-10-25 12:39 ` [PATCH v9 1/3] uacce: " Kai Ye
  2022-10-25 13:00   ` Greg KH
@ 2022-10-26  6:14   ` kernel test robot
  1 sibling, 0 replies; 7+ messages in thread
From: kernel test robot @ 2022-10-26  6:14 UTC (permalink / raw)
  To: Kai Ye, gregkh, linux-accelerators, linux-kernel, linuxarm,
	zhangfei.gao, wangzhou1
  Cc: oe-kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2478 bytes --]

Hi Kai,

I love your patch! Yet something to improve:

[auto build test ERROR on char-misc/char-misc-testing]
[also build test ERROR on herbert-cryptodev-2.6/master herbert-crypto-2.6/master linus/master v6.1-rc2 next-20221025]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Kai-Ye/crypto-hisilicon-supports-device-isolation-feature/20221025-204846
patch link:    https://lore.kernel.org/r/20221025123931.42161-2-yekai13%40huawei.com
patch subject: [PATCH v9 1/3] uacce: supports device isolation feature
config: ia64-randconfig-r032-20221023 (attached as .config)
compiler: ia64-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/61d411000cbcdf78e2d97d993858680e173e9d6b
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Kai-Ye/crypto-hisilicon-supports-device-isolation-feature/20221025-204846
        git checkout 61d411000cbcdf78e2d97d993858680e173e9d6b
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=ia64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   In file included from drivers/crypto/hisilicon/sec2/sec_main.c:17:
>> include/linux/uacce.h:184:5: warning: no previous prototype for 'uacce_hw_err_isolate' [-Wmissing-prototypes]
     184 | int uacce_hw_err_isolate(struct uacce_device *uacce)
         |     ^~~~~~~~~~~~~~~~~~~~
--
   ia64-linux-ld: drivers/crypto/hisilicon/qm.o: in function `uacce_hw_err_isolate':
>> include/linux/uacce.h:187: multiple definition of `uacce_hw_err_isolate'; drivers/crypto/hisilicon/sec2/sec_main.o:include/linux/uacce.h:187: first defined here


vim +187 include/linux/uacce.h

   183	
 > 184	int uacce_hw_err_isolate(struct uacce_device *uacce)
   185	{
   186		return -EINVAL;
 > 187	}
   188	#endif /* CONFIG_UACCE */
   189	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 26839 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-10-26  6:14 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-25 12:39 [PATCH v9 0/3] crypto: hisilicon - supports device isolation feature Kai Ye
2022-10-25 12:39 ` [PATCH v9 1/3] uacce: " Kai Ye
2022-10-25 13:00   ` Greg KH
2022-10-26  6:14   ` kernel test robot
2022-10-25 12:39 ` [PATCH v9 2/3] Documentation: add a isolation strategy sysfs node for uacce Kai Ye
2022-10-25 13:03   ` Greg KH
2022-10-25 12:39 ` [PATCH v9 3/3] crypto: hisilicon/qm - add the device isolation feature for acc Kai Ye

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).