All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
@ 2019-10-21  9:08 Chen, Guchun
       [not found] ` <20191021090735.19696-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Chen, Guchun @ 2019-10-21  9:08 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhang, Hawking, Li,
	Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice, Chen, Guchun

Reboot operation for ras recovery is one common debugfs
entry, which should get rid of ras_ctrl node and remove
ip dependence when inputting by user. So add one new
auto_reboot node in ras debugfs dir to achieve this.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6220394521e4..3adcd29feb5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
-	else if (sscanf(str, "reboot %32s", block_name) == 1)
-		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
@@ -223,7 +221,6 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * - 0: disable RAS on the block. Take ::head as its data.
  * - 1: enable RAS on the block. Take ::head as its data.
  * - 2: inject errors on the block. Take ::inject as its data.
- * - 3: reboot on unrecoverable error
  *
  * How to use the interface?
  * programs:
@@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
 		break;
-	case 3:
-		amdgpu_ras_get_context(adev)->reboot = true;
-		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
 	return ret == 1 ? size : -EIO;
 }
 
+/**
+ * DOC: AMDGPU RAS debugfs auto reboot interface
+ *
+ * After one uncorrectable error happens, GPU recovery will be scheduled.
+ * Due to the known problem in GPU recovery failing to bring GPU back, this
+ * interface provides one direct way to user to reboot system automatically
+ * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery
+ * routine will never be called.
+ *
+ * Enable auto_reboot:
+ *
+ *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ * Revert auto_reboot:
+ *
+ * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ */
+static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
+	const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev =
+		(struct amdgpu_device *)file_inode(f)->i_private;
+	char tmp[8] = {0};
+	int value = -1;
+
+	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
+		return -EINVAL;
+
+	if (kstrtoint(tmp, 10, &value))
+		return -EINVAL;
+
+	if (value == 1)
+		amdgpu_ras_get_context(adev)->reboot = true;
+	else if (value == 0)
+		amdgpu_ras_get_context(adev)->reboot = false;
+
+	return size;
+}
+
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 	.owner = THIS_MODULE,
 	.read = NULL,
@@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
 	.llseek = default_llseek
 };
 
+static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
+	.owner = THIS_MODULE,
+	.read = NULL,
+	.write = amdgpu_ras_debugfs_reboot_write,
+	.llseek = default_llseek
+};
+
 /**
  * DOC: AMDGPU RAS sysfs Error Count Interface
  *
@@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 				adev, &amdgpu_ras_debugfs_ctrl_ops);
 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
 				adev, &amdgpu_ras_debugfs_eeprom_ops);
+	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
+				adev, &amdgpu_ras_debugfs_reboot_ops);
 }
 
 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
       [not found] ` <20191021090735.19696-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-21  9:11   ` Christian König
       [not found]     ` <d1558d15-a1dc-e370-1410-ebfcafd01618-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2019-10-21 14:00   ` Deucher, Alexander
  1 sibling, 1 reply; 8+ messages in thread
From: Christian König @ 2019-10-21  9:11 UTC (permalink / raw)
  To: Chen, Guchun, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhang,
	Hawking, Li, Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice

Am 21.10.19 um 11:08 schrieb Chen, Guchun:
> Reboot operation for ras recovery is one common debugfs
> entry, which should get rid of ras_ctrl node and remove
> ip dependence when inputting by user. So add one new
> auto_reboot node in ras debugfs dir to achieve this.

We need some justification why this can't be a module parameter instead.

For example write something like we want to control reboot behavior on a 
per device basis.

Apart from that looks like a nice cleanup to me.

Regards,
Christian.

>
> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++++++---
>   1 file changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6220394521e4..3adcd29feb5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>   		op = 1;
>   	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>   		op = 2;
> -	else if (sscanf(str, "reboot %32s", block_name) == 1)
> -		op = 3;
>   	else if (str[0] && str[1] && str[2] && str[3])
>   		/* ascii string, but commands are not matched. */
>   		return -EINVAL;
> @@ -223,7 +221,6 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
>    * - 0: disable RAS on the block. Take ::head as its data.
>    * - 1: enable RAS on the block. Take ::head as its data.
>    * - 2: inject errors on the block. Take ::inject as its data.
> - * - 3: reboot on unrecoverable error
>    *
>    * How to use the interface?
>    * programs:
> @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
>   		/* data.inject.address is offset instead of absolute gpu address */
>   		ret = amdgpu_ras_error_inject(adev, &data.inject);
>   		break;
> -	case 3:
> -		amdgpu_ras_get_context(adev)->reboot = true;
> -		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
>   	return ret == 1 ? size : -EIO;
>   }
>   
> +/**
> + * DOC: AMDGPU RAS debugfs auto reboot interface
> + *
> + * After one uncorrectable error happens, GPU recovery will be scheduled.
> + * Due to the known problem in GPU recovery failing to bring GPU back, this
> + * interface provides one direct way to user to reboot system automatically
> + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery
> + * routine will never be called.
> + *
> + * Enable auto_reboot:
> + *
> + *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + * Revert auto_reboot:
> + *
> + * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + */
> +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
> +	const char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev =
> +		(struct amdgpu_device *)file_inode(f)->i_private;
> +	char tmp[8] = {0};
> +	int value = -1;
> +
> +	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
> +		return -EINVAL;
> +
> +	if (kstrtoint(tmp, 10, &value))
> +		return -EINVAL;
> +
> +	if (value == 1)
> +		amdgpu_ras_get_context(adev)->reboot = true;
> +	else if (value == 0)
> +		amdgpu_ras_get_context(adev)->reboot = false;
> +
> +	return size;
> +}
> +
>   static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>   	.owner = THIS_MODULE,
>   	.read = NULL,
> @@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
>   	.llseek = default_llseek
>   };
>   
> +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
> +	.owner = THIS_MODULE,
> +	.read = NULL,
> +	.write = amdgpu_ras_debugfs_reboot_write,
> +	.llseek = default_llseek
> +};
> +
>   /**
>    * DOC: AMDGPU RAS sysfs Error Count Interface
>    *
> @@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
>   				adev, &amdgpu_ras_debugfs_ctrl_ops);
>   	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
>   				adev, &amdgpu_ras_debugfs_eeprom_ops);
> +	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
> +				adev, &amdgpu_ras_debugfs_reboot_ops);
>   }
>   
>   void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
       [not found]     ` <d1558d15-a1dc-e370-1410-ebfcafd01618-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-10-21  9:19       ` Chen, Guchun
  0 siblings, 0 replies; 8+ messages in thread
From: Chen, Guchun @ 2019-10-21  9:19 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Zhang, Hawking, Li, Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice

Thanks Christian.

Let me update commit body in v2.

Regards,
Guchun

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Monday, October 21, 2019 5:12 PM
To: Chen, Guchun <Guchun.Chen@amd.com>; amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>
Cc: Li, Candice <Candice.Li@amd.com>
Subject: Re: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case

Am 21.10.19 um 11:08 schrieb Chen, Guchun:
> Reboot operation for ras recovery is one common debugfs entry, which 
> should get rid of ras_ctrl node and remove ip dependence when 
> inputting by user. So add one new auto_reboot node in ras debugfs dir 
> to achieve this.

We need some justification why this can't be a module parameter instead.

For example write something like we want to control reboot behavior on a per device basis.

Apart from that looks like a nice cleanup to me.

Regards,
Christian.

>
> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++++++---
>   1 file changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6220394521e4..3adcd29feb5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>   		op = 1;
>   	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>   		op = 2;
> -	else if (sscanf(str, "reboot %32s", block_name) == 1)
> -		op = 3;
>   	else if (str[0] && str[1] && str[2] && str[3])
>   		/* ascii string, but commands are not matched. */
>   		return -EINVAL;
> @@ -223,7 +221,6 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
>    * - 0: disable RAS on the block. Take ::head as its data.
>    * - 1: enable RAS on the block. Take ::head as its data.
>    * - 2: inject errors on the block. Take ::inject as its data.
> - * - 3: reboot on unrecoverable error
>    *
>    * How to use the interface?
>    * programs:
> @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
>   		/* data.inject.address is offset instead of absolute gpu address */
>   		ret = amdgpu_ras_error_inject(adev, &data.inject);
>   		break;
> -	case 3:
> -		amdgpu_ras_get_context(adev)->reboot = true;
> -		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
>   	return ret == 1 ? size : -EIO;
>   }
>   
> +/**
> + * DOC: AMDGPU RAS debugfs auto reboot interface
> + *
> + * After one uncorrectable error happens, GPU recovery will be scheduled.
> + * Due to the known problem in GPU recovery failing to bring GPU 
> +back, this
> + * interface provides one direct way to user to reboot system 
> +automatically
> + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU 
> +recovery
> + * routine will never be called.
> + *
> + * Enable auto_reboot:
> + *
> + *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + * Revert auto_reboot:
> + *
> + * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + */
> +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
> +	const char __user *buf, size_t size, loff_t *pos) {
> +	struct amdgpu_device *adev =
> +		(struct amdgpu_device *)file_inode(f)->i_private;
> +	char tmp[8] = {0};
> +	int value = -1;
> +
> +	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
> +		return -EINVAL;
> +
> +	if (kstrtoint(tmp, 10, &value))
> +		return -EINVAL;
> +
> +	if (value == 1)
> +		amdgpu_ras_get_context(adev)->reboot = true;
> +	else if (value == 0)
> +		amdgpu_ras_get_context(adev)->reboot = false;
> +
> +	return size;
> +}
> +
>   static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>   	.owner = THIS_MODULE,
>   	.read = NULL,
> @@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
>   	.llseek = default_llseek
>   };
>   
> +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
> +	.owner = THIS_MODULE,
> +	.read = NULL,
> +	.write = amdgpu_ras_debugfs_reboot_write,
> +	.llseek = default_llseek
> +};
> +
>   /**
>    * DOC: AMDGPU RAS sysfs Error Count Interface
>    *
> @@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
>   				adev, &amdgpu_ras_debugfs_ctrl_ops);
>   	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
>   				adev, &amdgpu_ras_debugfs_eeprom_ops);
> +	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
> +				adev, &amdgpu_ras_debugfs_reboot_ops);
>   }
>   
>   void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
       [not found] ` <20191021090735.19696-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
  2019-10-21  9:11   ` Christian König
@ 2019-10-21 14:00   ` Deucher, Alexander
  1 sibling, 0 replies; 8+ messages in thread
From: Deucher, Alexander @ 2019-10-21 14:00 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhang, Hawking, Li,
	Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice, Chen, Guchun

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Chen, Guchun
> Sent: Monday, October 21, 2019 5:08 AM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Li, Dennis <Dennis.Li@amd.com>;
> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Zhou1, Tao
> <Tao.Zhou1@amd.com>
> Cc: Li, Candice <Candice.Li@amd.com>; Chen, Guchun
> <Guchun.Chen@amd.com>
> Subject: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
> 
> Reboot operation for ras recovery is one common debugfs entry, which
> should get rid of ras_ctrl node and remove ip dependence when inputting by
> user. So add one new auto_reboot node in ras debugfs dir to achieve this.
> 
> Signed-off-by: Guchun Chen <guchun.chen@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55
> ++++++++++++++++++++++---
>  1 file changed, 49 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6220394521e4..3adcd29feb5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -153,8 +153,6 @@ static int
> amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>  		op = 1;
>  	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>  		op = 2;
> -	else if (sscanf(str, "reboot %32s", block_name) == 1)
> -		op = 3;
>  	else if (str[0] && str[1] && str[2] && str[3])
>  		/* ascii string, but commands are not matched. */
>  		return -EINVAL;
> @@ -223,7 +221,6 @@ static struct ras_manager
> *amdgpu_ras_find_obj(struct amdgpu_device *adev,
>   * - 0: disable RAS on the block. Take ::head as its data.
>   * - 1: enable RAS on the block. Take ::head as its data.
>   * - 2: inject errors on the block. Take ::inject as its data.
> - * - 3: reboot on unrecoverable error
>   *
>   * How to use the interface?
>   * programs:
> @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct
> file *f, const char __user *
>  		/* data.inject.address is offset instead of absolute gpu
> address */
>  		ret = amdgpu_ras_error_inject(adev, &data.inject);
>  		break;
> -	case 3:
> -		amdgpu_ras_get_context(adev)->reboot = true;
> -		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -346,6 +340,46 @@ static ssize_t
> amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
>  	return ret == 1 ? size : -EIO;
>  }
> 
> +/**
> + * DOC: AMDGPU RAS debugfs auto reboot interface
> + *
> + * After one uncorrectable error happens, GPU recovery will be scheduled.
> + * Due to the known problem in GPU recovery failing to bring GPU back,
> +this
> + * interface provides one direct way to user to reboot system
> +automatically
> + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal
> GPU
> +recovery
> + * routine will never be called.
> + *
> + * Enable auto_reboot:
> + *
> + *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + * Revert auto_reboot:
> + *
> + * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + */
> +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
> +	const char __user *buf, size_t size, loff_t *pos) {
> +	struct amdgpu_device *adev =
> +		(struct amdgpu_device *)file_inode(f)->i_private;
> +	char tmp[8] = {0};
> +	int value = -1;
> +
> +	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
> +		return -EINVAL;
> +
> +	if (kstrtoint(tmp, 10, &value))
> +		return -EINVAL;
> +
> +	if (value == 1)
> +		amdgpu_ras_get_context(adev)->reboot = true;
> +	else if (value == 0)
> +		amdgpu_ras_get_context(adev)->reboot = false;
> +
> +	return size;
> +}
> +
>  static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>  	.owner = THIS_MODULE,
>  	.read = NULL,
> @@ -360,6 +394,13 @@ static const struct file_operations
> amdgpu_ras_debugfs_eeprom_ops = {
>  	.llseek = default_llseek
>  };
> 
> +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
> +	.owner = THIS_MODULE,
> +	.read = NULL,
> +	.write = amdgpu_ras_debugfs_reboot_write,
> +	.llseek = default_llseek
> +};
> +
>  /**
>   * DOC: AMDGPU RAS sysfs Error Count Interface
>   *
> @@ -1037,6 +1078,8 @@ static void
> amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
>  				adev, &amdgpu_ras_debugfs_ctrl_ops);
>  	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO,
> con->dir,
>  				adev, &amdgpu_ras_debugfs_eeprom_ops);
> +	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
> +				adev, &amdgpu_ras_debugfs_reboot_ops);
>  }
> 
>  void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
> --
> 2.17.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
       [not found] ` <20191021093245.28945-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
  2019-10-21 11:13   ` Christian König
@ 2019-10-21 15:40   ` Deucher, Alexander
  1 sibling, 0 replies; 8+ messages in thread
From: Deucher, Alexander @ 2019-10-21 15:40 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Koenig, Christian,
	Zhang, Hawking, Li, Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice, Chen, Guchun

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Chen, Guchun
> Sent: Monday, October 21, 2019 5:33 AM
> To: amd-gfx@lists.freedesktop.org; Koenig, Christian
> <Christian.Koenig@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Li, Dennis <Dennis.Li@amd.com>;
> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Zhou1, Tao
> <Tao.Zhou1@amd.com>
> Cc: Li, Candice <Candice.Li@amd.com>; Chen, Guchun
> <Guchun.Chen@amd.com>
> Subject: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
> 
> Ras reboot debugfs node allows user one easy control to avoid gpu recovery
> hang problem and directly reboot system per card basis, after ras
> uncorrectable error happens. However, it is one common entry, which
> should get rid of ras_ctrl node and remove ip dependence when inputting by
> user. So add one new auto_reboot node in ras debugfs dir to achieve this.
> 
> v2: in commit mssage, add justification why ras reboot debugfs node is
> needed.
> 
> Signed-off-by: Guchun Chen <guchun.chen@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57
> ++++++++++++++++++++++---
>  1 file changed, 50 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6220394521e4..6450065b88de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -153,8 +153,6 @@ static int
> amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>  		op = 1;
>  	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>  		op = 2;
> -	else if (sscanf(str, "reboot %32s", block_name) == 1)
> -		op = 3;
>  	else if (str[0] && str[1] && str[2] && str[3])
>  		/* ascii string, but commands are not matched. */
>  		return -EINVAL;
> @@ -218,12 +216,11 @@ static struct ras_manager
> *amdgpu_ras_find_obj(struct amdgpu_device *adev,
>   * value to the address.
>   *
>   * Second member: struct ras_debug_if::op.
> - * It has four kinds of operations.
> + * It has three kinds of operations.
>   *
>   * - 0: disable RAS on the block. Take ::head as its data.
>   * - 1: enable RAS on the block. Take ::head as its data.
>   * - 2: inject errors on the block. Take ::inject as its data.
> - * - 3: reboot on unrecoverable error
>   *
>   * How to use the interface?
>   * programs:
> @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct
> file *f, const char __user *
>  		/* data.inject.address is offset instead of absolute gpu
> address */
>  		ret = amdgpu_ras_error_inject(adev, &data.inject);
>  		break;
> -	case 3:
> -		amdgpu_ras_get_context(adev)->reboot = true;
> -		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -346,6 +340,46 @@ static ssize_t
> amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
>  	return ret == 1 ? size : -EIO;
>  }
> 
> +/**
> + * DOC: AMDGPU RAS debugfs auto reboot interface
> + *
> + * After one uncorrectable error happens, GPU recovery will be scheduled.
> + * Due to the known problem in GPU recovery failing to bring GPU back,
> +this
> + * interface provides one direct way to user to reboot system
> +automatically
> + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal
> GPU
> +recovery
> + * routine will never be called.
> + *
> + * Enable auto_reboot:
> + *
> + *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + * Revert auto_reboot:
> + *
> + * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + */
> +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
> +	const char __user *buf, size_t size, loff_t *pos) {
> +	struct amdgpu_device *adev =
> +		(struct amdgpu_device *)file_inode(f)->i_private;
> +	char tmp[8] = {0};
> +	int value = -1;
> +
> +	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
> +		return -EINVAL;
> +
> +	if (kstrtoint(tmp, 10, &value))
> +		return -EINVAL;
> +
> +	if (value == 1)
> +		amdgpu_ras_get_context(adev)->reboot = true;
> +	else if (value == 0)
> +		amdgpu_ras_get_context(adev)->reboot = false;
> +
> +	return size;
> +}
> +
>  static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>  	.owner = THIS_MODULE,
>  	.read = NULL,
> @@ -360,6 +394,13 @@ static const struct file_operations
> amdgpu_ras_debugfs_eeprom_ops = {
>  	.llseek = default_llseek
>  };
> 
> +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
> +	.owner = THIS_MODULE,
> +	.read = NULL,
> +	.write = amdgpu_ras_debugfs_reboot_write,
> +	.llseek = default_llseek
> +};
> +
>  /**
>   * DOC: AMDGPU RAS sysfs Error Count Interface
>   *
> @@ -1037,6 +1078,8 @@ static void
> amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
>  				adev, &amdgpu_ras_debugfs_ctrl_ops);
>  	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO,
> con->dir,
>  				adev, &amdgpu_ras_debugfs_eeprom_ops);
> +	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
> +				adev, &amdgpu_ras_debugfs_reboot_ops);
>  }
> 
>  void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
> --
> 2.17.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
       [not found]     ` <058c4a9d-1ae2-5721-4404-794e575b42a8-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-10-21 14:04       ` Chen, Guchun
  0 siblings, 0 replies; 8+ messages in thread
From: Chen, Guchun @ 2019-10-21 14:04 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Zhang, Hawking, Li, Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice

Thanks Christian.
Your suggestion is excellent, this can help get rid of parsing input code in this patch. I will prepare patch v3 soon.

Regards,
Guchun

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Monday, October 21, 2019 7:14 PM
To: Chen, Guchun <Guchun.Chen@amd.com>; amd-gfx@lists.freedesktop.org; Koenig, Christian <Christian.Koenig@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>
Cc: Li, Candice <Candice.Li@amd.com>
Subject: Re: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case

Am 21.10.19 um 11:33 schrieb Chen, Guchun:
> Ras reboot debugfs node allows user one easy control to avoid gpu 
> recovery hang problem and directly reboot system per card basis, after 
> ras uncorrectable error happens. However, it is one common entry, 
> which should get rid of ras_ctrl node and remove ip dependence when 
> inputting by user. So add one new auto_reboot node in ras debugfs dir 
> to achieve this.
>
> v2: in commit mssage, add justification why ras reboot debugfs node is 
> needed.
>
> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57 ++++++++++++++++++++++---
>   1 file changed, 50 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6220394521e4..6450065b88de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>   		op = 1;
>   	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>   		op = 2;
> -	else if (sscanf(str, "reboot %32s", block_name) == 1)
> -		op = 3;
>   	else if (str[0] && str[1] && str[2] && str[3])
>   		/* ascii string, but commands are not matched. */
>   		return -EINVAL;
> @@ -218,12 +216,11 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
>    * value to the address.
>    *
>    * Second member: struct ras_debug_if::op.
> - * It has four kinds of operations.
> + * It has three kinds of operations.
>    *
>    * - 0: disable RAS on the block. Take ::head as its data.
>    * - 1: enable RAS on the block. Take ::head as its data.
>    * - 2: inject errors on the block. Take ::inject as its data.
> - * - 3: reboot on unrecoverable error
>    *
>    * How to use the interface?
>    * programs:
> @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
>   		/* data.inject.address is offset instead of absolute gpu address */
>   		ret = amdgpu_ras_error_inject(adev, &data.inject);
>   		break;
> -	case 3:
> -		amdgpu_ras_get_context(adev)->reboot = true;
> -		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
>   	return ret == 1 ? size : -EIO;
>   }
>   
> +/**
> + * DOC: AMDGPU RAS debugfs auto reboot interface
> + *
> + * After one uncorrectable error happens, GPU recovery will be scheduled.
> + * Due to the known problem in GPU recovery failing to bring GPU 
> +back, this
> + * interface provides one direct way to user to reboot system 
> +automatically
> + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU 
> +recovery
> + * routine will never be called.
> + *
> + * Enable auto_reboot:
> + *
> + *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + * Revert auto_reboot:
> + *
> + * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + */
> +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
> +	const char __user *buf, size_t size, loff_t *pos) {
> +	struct amdgpu_device *adev =
> +		(struct amdgpu_device *)file_inode(f)->i_private;
> +	char tmp[8] = {0};
> +	int value = -1;
> +
> +	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
> +		return -EINVAL;
> +
> +	if (kstrtoint(tmp, 10, &value))
> +		return -EINVAL;
> +
> +	if (value == 1)
> +		amdgpu_ras_get_context(adev)->reboot = true;
> +	else if (value == 0)
> +		amdgpu_ras_get_context(adev)->reboot = false;
> +
> +	return size;
> +}
> +
>   static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>   	.owner = THIS_MODULE,
>   	.read = NULL,
> @@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
>   	.llseek = default_llseek
>   };
>   
> +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
> +	.owner = THIS_MODULE,
> +	.read = NULL,
> +	.write = amdgpu_ras_debugfs_reboot_write,
> +	.llseek = default_llseek
> +};
> +
>   /**
>    * DOC: AMDGPU RAS sysfs Error Count Interface
>    *
> @@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
>   				adev, &amdgpu_ras_debugfs_ctrl_ops);
>   	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
>   				adev, &amdgpu_ras_debugfs_eeprom_ops);
> +	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
> +				adev, &amdgpu_ras_debugfs_reboot_ops);

Since we don't need to run any extra checking I think you could simplify the patch greatly by using debugfs_create_bool() here. E.g. just use:

debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir, &con->reboot);

And you are done with that.

Regards,
Christian.

>   }
>   
>   void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
       [not found] ` <20191021093245.28945-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-21 11:13   ` Christian König
       [not found]     ` <058c4a9d-1ae2-5721-4404-794e575b42a8-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2019-10-21 15:40   ` Deucher, Alexander
  1 sibling, 1 reply; 8+ messages in thread
From: Christian König @ 2019-10-21 11:13 UTC (permalink / raw)
  To: Chen, Guchun, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Koenig,
	Christian, Zhang, Hawking, Li, Dennis, Grodzovsky, Andrey, Zhou1,
	Tao
  Cc: Li, Candice

Am 21.10.19 um 11:33 schrieb Chen, Guchun:
> Ras reboot debugfs node allows user one easy control to avoid
> gpu recovery hang problem and directly reboot system per card
> basis, after ras uncorrectable error happens. However, it is
> one common entry, which should get rid of ras_ctrl node and
> remove ip dependence when inputting by user. So add one new
> auto_reboot node in ras debugfs dir to achieve this.
>
> v2: in commit mssage, add justification why ras reboot debugfs
> node is needed.
>
> Signed-off-by: Guchun Chen <guchun.chen@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57 ++++++++++++++++++++++---
>   1 file changed, 50 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6220394521e4..6450065b88de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>   		op = 1;
>   	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>   		op = 2;
> -	else if (sscanf(str, "reboot %32s", block_name) == 1)
> -		op = 3;
>   	else if (str[0] && str[1] && str[2] && str[3])
>   		/* ascii string, but commands are not matched. */
>   		return -EINVAL;
> @@ -218,12 +216,11 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
>    * value to the address.
>    *
>    * Second member: struct ras_debug_if::op.
> - * It has four kinds of operations.
> + * It has three kinds of operations.
>    *
>    * - 0: disable RAS on the block. Take ::head as its data.
>    * - 1: enable RAS on the block. Take ::head as its data.
>    * - 2: inject errors on the block. Take ::inject as its data.
> - * - 3: reboot on unrecoverable error
>    *
>    * How to use the interface?
>    * programs:
> @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
>   		/* data.inject.address is offset instead of absolute gpu address */
>   		ret = amdgpu_ras_error_inject(adev, &data.inject);
>   		break;
> -	case 3:
> -		amdgpu_ras_get_context(adev)->reboot = true;
> -		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
>   	return ret == 1 ? size : -EIO;
>   }
>   
> +/**
> + * DOC: AMDGPU RAS debugfs auto reboot interface
> + *
> + * After one uncorrectable error happens, GPU recovery will be scheduled.
> + * Due to the known problem in GPU recovery failing to bring GPU back, this
> + * interface provides one direct way to user to reboot system automatically
> + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery
> + * routine will never be called.
> + *
> + * Enable auto_reboot:
> + *
> + *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + * Revert auto_reboot:
> + *
> + * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
> + *
> + */
> +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
> +	const char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev =
> +		(struct amdgpu_device *)file_inode(f)->i_private;
> +	char tmp[8] = {0};
> +	int value = -1;
> +
> +	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
> +		return -EINVAL;
> +
> +	if (kstrtoint(tmp, 10, &value))
> +		return -EINVAL;
> +
> +	if (value == 1)
> +		amdgpu_ras_get_context(adev)->reboot = true;
> +	else if (value == 0)
> +		amdgpu_ras_get_context(adev)->reboot = false;
> +
> +	return size;
> +}
> +
>   static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>   	.owner = THIS_MODULE,
>   	.read = NULL,
> @@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
>   	.llseek = default_llseek
>   };
>   
> +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
> +	.owner = THIS_MODULE,
> +	.read = NULL,
> +	.write = amdgpu_ras_debugfs_reboot_write,
> +	.llseek = default_llseek
> +};
> +
>   /**
>    * DOC: AMDGPU RAS sysfs Error Count Interface
>    *
> @@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
>   				adev, &amdgpu_ras_debugfs_ctrl_ops);
>   	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
>   				adev, &amdgpu_ras_debugfs_eeprom_ops);
> +	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
> +				adev, &amdgpu_ras_debugfs_reboot_ops);

Since we don't need to run any extra checking I think you could simplify 
the patch greatly by using debugfs_create_bool() here. E.g. just use:

debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir, 
&con->reboot);

And you are done with that.

Regards,
Christian.

>   }
>   
>   void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
@ 2019-10-21  9:33 Chen, Guchun
       [not found] ` <20191021093245.28945-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Chen, Guchun @ 2019-10-21  9:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Koenig, Christian,
	Zhang, Hawking, Li, Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice, Chen, Guchun

Ras reboot debugfs node allows user one easy control to avoid
gpu recovery hang problem and directly reboot system per card
basis, after ras uncorrectable error happens. However, it is
one common entry, which should get rid of ras_ctrl node and
remove ip dependence when inputting by user. So add one new
auto_reboot node in ras debugfs dir to achieve this.

v2: in commit mssage, add justification why ras reboot debugfs
node is needed.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57 ++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6220394521e4..6450065b88de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
-	else if (sscanf(str, "reboot %32s", block_name) == 1)
-		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
@@ -218,12 +216,11 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * value to the address.
  *
  * Second member: struct ras_debug_if::op.
- * It has four kinds of operations.
+ * It has three kinds of operations.
  *
  * - 0: disable RAS on the block. Take ::head as its data.
  * - 1: enable RAS on the block. Take ::head as its data.
  * - 2: inject errors on the block. Take ::inject as its data.
- * - 3: reboot on unrecoverable error
  *
  * How to use the interface?
  * programs:
@@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
 		break;
-	case 3:
-		amdgpu_ras_get_context(adev)->reboot = true;
-		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
 	return ret == 1 ? size : -EIO;
 }
 
+/**
+ * DOC: AMDGPU RAS debugfs auto reboot interface
+ *
+ * After one uncorrectable error happens, GPU recovery will be scheduled.
+ * Due to the known problem in GPU recovery failing to bring GPU back, this
+ * interface provides one direct way to user to reboot system automatically
+ * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery
+ * routine will never be called.
+ *
+ * Enable auto_reboot:
+ *
+ *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ * Revert auto_reboot:
+ *
+ * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ */
+static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
+	const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev =
+		(struct amdgpu_device *)file_inode(f)->i_private;
+	char tmp[8] = {0};
+	int value = -1;
+
+	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
+		return -EINVAL;
+
+	if (kstrtoint(tmp, 10, &value))
+		return -EINVAL;
+
+	if (value == 1)
+		amdgpu_ras_get_context(adev)->reboot = true;
+	else if (value == 0)
+		amdgpu_ras_get_context(adev)->reboot = false;
+
+	return size;
+}
+
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 	.owner = THIS_MODULE,
 	.read = NULL,
@@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
 	.llseek = default_llseek
 };
 
+static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
+	.owner = THIS_MODULE,
+	.read = NULL,
+	.write = amdgpu_ras_debugfs_reboot_write,
+	.llseek = default_llseek
+};
+
 /**
  * DOC: AMDGPU RAS sysfs Error Count Interface
  *
@@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 				adev, &amdgpu_ras_debugfs_ctrl_ops);
 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
 				adev, &amdgpu_ras_debugfs_eeprom_ops);
+	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
+				adev, &amdgpu_ras_debugfs_reboot_ops);
 }
 
 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-10-21 15:40 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-21  9:08 [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case Chen, Guchun
     [not found] ` <20191021090735.19696-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
2019-10-21  9:11   ` Christian König
     [not found]     ` <d1558d15-a1dc-e370-1410-ebfcafd01618-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-10-21  9:19       ` Chen, Guchun
2019-10-21 14:00   ` Deucher, Alexander
2019-10-21  9:33 Chen, Guchun
     [not found] ` <20191021093245.28945-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
2019-10-21 11:13   ` Christian König
     [not found]     ` <058c4a9d-1ae2-5721-4404-794e575b42a8-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-10-21 14:04       ` Chen, Guchun
2019-10-21 15:40   ` Deucher, Alexander

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.