All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] drm/amdgpu: add badpages sysfs interface
@ 2019-05-08  3:15 Pan, Xinhui
       [not found] ` <SN6PR12MB280057754F6035815F41314887320-kxOKjb6HO/EqkY47FTA1ogdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 2+ messages in thread
From: Pan, Xinhui @ 2019-05-08  3:15 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Deucher, Alexander

Add a badpages sysfs node.
It outputs the list of bad pages, one per line, in the format
gpu pfn : gpu page size : flags

gpu pfn is the GPU page frame number (PFN).
flags can be R, P, or F.

example
0x00000000 : 0x00001000 : R
0x00000001 : 0x00001000 : R
0x00000002 : 0x00001000 : R
0x00000003 : 0x00001000 : R
0x00000004 : 0x00001000 : R
0x00000005 : 0x00001000 : R
0x00000006 : 0x00001000 : R
0x00000007 : 0x00001000 : P
0x00000008 : 0x00001000 : P
0x00000009 : 0x00001000 : P

R: reserved.
P: pending reservation.
F: failed to reserve for some reason.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 133 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |   1 +
 2 files changed, 134 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 22bd21efe6b1..2e9fb785019d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -90,6 +90,12 @@ struct ras_manager {
 	struct ras_err_data err_data;
 };
 
+struct ras_badpage {
+	unsigned int bp;	/* page frame number of the bad page */
+	unsigned int size;	/* size of the entry, printed as hex in sysfs */
+	unsigned int flags;	/* 0: reserved (R), 1: pending (P), 2: failed (F) */
+};
+
 const char *ras_error_string[] = {
 	"none",
 	"parity",
@@ -691,6 +697,62 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 
 /* sysfs begin */
 
+static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
+		struct ras_badpage **bps, unsigned int *count);
+
+/* Map a bad page state code to the one-letter flag printed in sysfs. */
+static const char *amdgpu_ras_badpage_flags_str(unsigned int flags)
+{
+	switch (flags) {
+	case 0:		/* reserved */
+		return "R";
+	case 1:		/* pending for reserve */
+		return "P";
+	default:	/* 2 (unable to reserve) and any unknown value */
+		return "F";
+	}
+}
+
+/*
+ * sysfs read handler for the bad page list. One fixed-width line per page:
+ *   <gpu pfn> : <gpu page size> : <flag>
+ * flag is R (reserved), P (pending for reserve) or F (unable to reserve).
+ * Only whole entries are emitted; returns the number of bytes written.
+ */
+static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
+		struct kobject *kobj, struct bin_attribute *attr,
+		char *buf, loff_t ppos, size_t count)
+{
+	struct amdgpu_ras *con =
+		container_of(attr, struct amdgpu_ras, badpages_attr);
+	struct amdgpu_device *adev = con->adev;
+	const unsigned int element_size =
+		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
+	unsigned int start, end;
+	ssize_t s = 0;
+	struct ras_badpage *bps = NULL;
+	unsigned int bps_count = 0;
+
+	/* With count == 0, "ppos + count - 1" can wrap and unbound "end". */
+	if (!count)
+		return 0;
+	start = (ppos + element_size - 1) / element_size;
+	end = (ppos + count - 1) / element_size;
+	memset(buf, 0, count);
+
+	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
+		return 0;
+
+	for (; start < end && start < bps_count; start++)
+		s += scnprintf(&buf[s], element_size + 1,
+				"0x%08x : 0x%08x : %1s\n",
+				bps[start].bp,
+				bps[start].size,
+				amdgpu_ras_badpage_flags_str(bps[start].flags));
+
+	kfree(bps);
+	return s;
+}
+
 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -731,9 +793,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
 		&con->features_attr.attr,
 		NULL
 	};
+	struct bin_attribute *bin_attrs[] = {
+		&con->badpages_attr,
+		NULL
+	};
 	struct attribute_group group = {
 		.name = "ras",
 		.attrs = attrs,
+		.bin_attrs = bin_attrs,
 	};
 
 	con->features_attr = (struct device_attribute) {
@@ -743,7 +810,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
 		},
 			.show = amdgpu_ras_sysfs_features_read,
 	};
+
+	con->badpages_attr = (struct bin_attribute) {
+		.attr = {
+			.name = "umc_badpages",
+			.mode = S_IRUGO,
+		},
+		.size = 0,
+		.private = NULL,
+		.read = amdgpu_ras_sysfs_badpages_read,
+	};
+
 	sysfs_attr_init(attrs[0]);
+	sysfs_bin_attr_init(bin_attrs[0]);
 
 	return sysfs_create_group(&adev->dev->kobj, &group);
 }
@@ -755,9 +834,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 		&con->features_attr.attr,
 		NULL
 	};
+	struct bin_attribute *bin_attrs[] = {
+		&con->badpages_attr,
+		NULL
+	};
 	struct attribute_group group = {
 		.name = "ras",
 		.attrs = attrs,
+		.bin_attrs = bin_attrs,
 	};
 
 	sysfs_remove_group(&adev->dev->kobj, &group);
@@ -1089,6 +1173,55 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
 /* ih end */
 
 /* recovery begin */
+
+/* Snapshot the bad page list while holding con->recovery_lock.
+ * Returns 0 on success (*bps is NULL and *count is 0 if the list is empty)
+ * or a negative errno; on success the caller must kfree() *bps.
+ */
+static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
+		struct ras_badpage **bps, unsigned int *count)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	int i = 0;
+	int ret = 0;
+
+	if (!con || !con->eh_data || !bps || !count)
+		return -EINVAL;
+
+	*bps = NULL;
+	*count = 0;
+	mutex_lock(&con->recovery_lock);
+	data = con->eh_data;
+	if (!data || data->count == 0)
+		goto out;
+
+	/* kmalloc_array() fails cleanly if count * size would overflow. */
+	*bps = kmalloc_array(data->count, sizeof(**bps), GFP_KERNEL);
+	if (!*bps) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	/* TODO: combine N nearby pages into one entry with size * N. */
+	for (; i < data->count; i++) {
+		(*bps)[i] = (struct ras_badpage){
+			.bp = data->bps[i].bp,
+			.size = AMDGPU_GPU_PAGE_SIZE,
+			.flags = 0,
+		};
+		/* 1 (P): at or past last_reserved; 2 (F): below it, no bo. */
+		if (data->last_reserved <= i)
+			(*bps)[i].flags = 1;
+		else if (data->bps[i].bo == NULL)
+			(*bps)[i].flags = 2;
+	}
+
+	*count = data->count;
+out:
+	mutex_unlock(&con->recovery_lock);
+	return ret;
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
 	struct amdgpu_ras *ras =
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index eaef5edefc34..600f735d9201 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -93,6 +93,7 @@ struct amdgpu_ras {
 	struct dentry *ent;
 	/* sysfs */
 	struct device_attribute features_attr;
+	struct bin_attribute badpages_attr;
 	/* block array */
 	struct ras_manager *objs;
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add badpages sysfs interafce
       [not found] ` <SN6PR12MB280057754F6035815F41314887320-kxOKjb6HO/EqkY47FTA1ogdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-05-08 19:10   ` Alex Deucher
  0 siblings, 0 replies; 2+ messages in thread
From: Alex Deucher @ 2019-05-08 19:10 UTC (permalink / raw)
  To: Pan, Xinhui; +Cc: Deucher, Alexander, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Tue, May 7, 2019 at 11:15 PM Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>
> add badpages node.
> it will output badpages list in format
> page : size  : flags

gpu pfn : gpu page size : flags

>
> page is PFN.
> flags can be R, P, F.
>
> example
> 0x00000000 : 0x00001000 : R
> 0x00000001 : 0x00001000 : R
> 0x00000002 : 0x00001000 : R
> 0x00000003 : 0x00001000 : R
> 0x00000004 : 0x00001000 : R
> 0x00000005 : 0x00001000 : R
> 0x00000006 : 0x00001000 : R
> 0x00000007 : 0x00001000 : P
> 0x00000008 : 0x00001000 : P
> 0x00000009 : 0x00001000 : P
>
> R: reserved.
> P: pending
> F: failed to reserve for some reason.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 133 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |   1 +
>  2 files changed, 134 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 22bd21efe6b1..2e9fb785019d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -90,6 +90,12 @@ struct ras_manager {
>         struct ras_err_data err_data;
>  };
>
> +struct ras_badpage {
> +       unsigned int bp;
> +       unsigned int size;
> +       unsigned int flags;
> +};
> +
>  const char *ras_error_string[] = {
>         "none",
>         "parity",
> @@ -691,6 +697,62 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>
>  /* sysfs begin */
>
> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
> +               struct ras_badpage **bps, unsigned int *count);
> +
> +static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
> +{
> +       switch (flags) {
> +       case 0:
> +               return "R";
> +       case 1:
> +               return "P";
> +       case 2:
> +       default:
> +               return "F";
> +       };
> +}
> +
> +/*
> + * format: start - end : R|P|F
> + * start, end: page frame number, end is not included.
> + * R: reserved
> + * P: pedning for reserve

pending

> + * F: unable to reserve.
> + */
> +
> +static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
> +               struct kobject *kobj, struct bin_attribute *attr,
> +               char *buf, loff_t ppos, size_t count)
> +{
> +       struct amdgpu_ras *con =
> +               container_of(attr, struct amdgpu_ras, badpages_attr);
> +       struct amdgpu_device *adev = con->adev;
> +       const unsigned int element_size =
> +               sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
> +       unsigned int start = (ppos + element_size - 1) / element_size;
> +       unsigned int end = (ppos + count - 1) / element_size;
> +       ssize_t s = 0;
> +       struct ras_badpage *bps = NULL;
> +       unsigned int bps_count = 0;
> +
> +       memset(buf, 0, count);
> +
> +       if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
> +               return 0;
> +
> +       for (; start < end && start < bps_count; start++)
> +               s += scnprintf(&buf[s], element_size + 1,
> +                               "0x%08x : 0x%08x : %1s\n",
> +                               bps[start].bp,
> +                               bps[start].size,
> +                               amdgpu_ras_badpage_flags_str(bps[start].flags));
> +
> +       kfree(bps);
> +
> +       return s;
> +}
> +
>  static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
>                 struct device_attribute *attr, char *buf)
>  {
> @@ -731,9 +793,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
>                 &con->features_attr.attr,
>                 NULL
>         };
> +       struct bin_attribute *bin_attrs[] = {
> +               &con->badpages_attr,
> +               NULL
> +       };
>         struct attribute_group group = {
>                 .name = "ras",
>                 .attrs = attrs,
> +               .bin_attrs = bin_attrs,
>         };
>
>         con->features_attr = (struct device_attribute) {
> @@ -743,7 +810,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
>                 },
>                         .show = amdgpu_ras_sysfs_features_read,
>         };
> +
> +       con->badpages_attr = (struct bin_attribute) {
> +               .attr = {
> +                       .name = "umc_badpages",

How about "gpu_vram_bad_pages"?

> +                       .mode = S_IRUGO,
> +               },
> +               .size = 0,
> +               .private = NULL,
> +               .read = amdgpu_ras_sysfs_badpages_read,
> +       };
> +
>         sysfs_attr_init(attrs[0]);
> +       sysfs_bin_attr_init(bin_attrs[0]);
>
>         return sysfs_create_group(&adev->dev->kobj, &group);
>  }
> @@ -755,9 +834,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
>                 &con->features_attr.attr,
>                 NULL
>         };
> +       struct bin_attribute *bin_attrs[] = {
> +               &con->badpages_attr,
> +               NULL
> +       };
>         struct attribute_group group = {
>                 .name = "ras",
>                 .attrs = attrs,
> +               .bin_attrs = bin_attrs,
>         };
>
>         sysfs_remove_group(&adev->dev->kobj, &group);
> @@ -1089,6 +1173,55 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
>  /* ih end */
>
>  /* recovery begin */
> +
> +/* return 0 on success.
> + * caller need free bps.
> + */
> +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
> +               struct ras_badpage **bps, unsigned int *count)
> +{
> +       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +       struct ras_err_handler_data *data;
> +       int i = 0;
> +       int ret = 0;
> +
> +       if (!con || !con->eh_data || !bps || !count)
> +               return -EINVAL;
> +
> +       mutex_lock(&con->recovery_lock);
> +       data = con->eh_data;
> +       if (!data || data->count == 0) {
> +               *bps = NULL;
> +               goto out;
> +       }
> +
> +       *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
> +       if (!*bps) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +       /* TODO
> +        * We can combine N nearby pages into one entry with size * N.
> +        */
> +       for (; i < data->count; i++) {
> +               (*bps)[i] = (struct ras_badpage){
> +                       .bp = data->bps[i].bp,
> +                       .size = AMDGPU_GPU_PAGE_SIZE,
> +                       .flags = 0,
> +               };
> +
> +               if (data->last_reserved <= i)
> +                       (*bps)[i].flags = 1;
> +               else if (data->bps[i].bo == NULL)
> +                       (*bps)[i].flags = 2;
> +       }
> +
> +       *count = data->count;
> +out:
> +       mutex_unlock(&con->recovery_lock);
> +       return ret;
> +}
> +

Please also add a DOC section describing the sysfs interfaces, or
update the existing DOC section to cover the sysfs files.

Alex

>  static void amdgpu_ras_do_recovery(struct work_struct *work)
>  {
>         struct amdgpu_ras *ras =
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index eaef5edefc34..600f735d9201 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -93,6 +93,7 @@ struct amdgpu_ras {
>         struct dentry *ent;
>         /* sysfs */
>         struct device_attribute features_attr;
> +       struct bin_attribute badpages_attr;
>         /* block array */
>         struct ras_manager *objs;
>
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-05-08 19:10 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-08  3:15 [PATCH v2] drm/amdgpu: add badpages sysfs interafce Pan, Xinhui
     [not found] ` <SN6PR12MB280057754F6035815F41314887320-kxOKjb6HO/EqkY47FTA1ogdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-05-08 19:10   ` Alex Deucher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.