All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
@ 2019-10-21  9:33 Chen, Guchun
       [not found] ` <20191021093245.28945-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Chen, Guchun @ 2019-10-21  9:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Koenig, Christian,
	Zhang, Hawking, Li, Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice, Chen, Guchun

Ras reboot debugfs node allows user one easy control to avoid
gpu recovery hang problem and directly reboot system per card
basis, after ras uncorrectable error happens. However, it is
one common entry, which should get rid of ras_ctrl node and
remove ip dependence when inputting by user. So add one new
auto_reboot node in ras debugfs dir to achieve this.

v2: in commit mssage, add justification why ras reboot debugfs
node is needed.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57 ++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6220394521e4..6450065b88de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
-	else if (sscanf(str, "reboot %32s", block_name) == 1)
-		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
@@ -218,12 +216,11 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * value to the address.
  *
  * Second member: struct ras_debug_if::op.
- * It has four kinds of operations.
+ * It has three kinds of operations.
  *
  * - 0: disable RAS on the block. Take ::head as its data.
  * - 1: enable RAS on the block. Take ::head as its data.
  * - 2: inject errors on the block. Take ::inject as its data.
- * - 3: reboot on unrecoverable error
  *
  * How to use the interface?
  * programs:
@@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
 		break;
-	case 3:
-		amdgpu_ras_get_context(adev)->reboot = true;
-		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
 	return ret == 1 ? size : -EIO;
 }
 
+/**
+ * DOC: AMDGPU RAS debugfs auto reboot interface
+ *
+ * After one uncorrectable error happens, GPU recovery will be scheduled.
+ * Due to the known problem in GPU recovery failing to bring GPU back, this
+ * interface provides one direct way to user to reboot system automatically
+ * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery
+ * routine will never be called.
+ *
+ * Enable auto_reboot:
+ *
+ *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ * Revert auto_reboot:
+ *
+ * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ */
+static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
+	const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev =
+		(struct amdgpu_device *)file_inode(f)->i_private;
+	char tmp[8] = {0};
+	int value = -1;
+
+	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
+		return -EINVAL;
+
+	if (kstrtoint(tmp, 10, &value))
+		return -EINVAL;
+
+	if (value == 1)
+		amdgpu_ras_get_context(adev)->reboot = true;
+	else if (value == 0)
+		amdgpu_ras_get_context(adev)->reboot = false;
+
+	return size;
+}
+
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 	.owner = THIS_MODULE,
 	.read = NULL,
@@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
 	.llseek = default_llseek
 };
 
+static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
+	.owner = THIS_MODULE,
+	.read = NULL,
+	.write = amdgpu_ras_debugfs_reboot_write,
+	.llseek = default_llseek
+};
+
 /**
  * DOC: AMDGPU RAS sysfs Error Count Interface
  *
@@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 				adev, &amdgpu_ras_debugfs_ctrl_ops);
 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
 				adev, &amdgpu_ras_debugfs_eeprom_ops);
+	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
+				adev, &amdgpu_ras_debugfs_reboot_ops);
 }
 
 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread
* [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case
@ 2019-10-21  9:08 Chen, Guchun
       [not found] ` <20191021090735.19696-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Chen, Guchun @ 2019-10-21  9:08 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhang, Hawking, Li,
	Dennis, Grodzovsky, Andrey, Zhou1, Tao
  Cc: Li, Candice, Chen, Guchun

Reboot operation for ras recovery is one common debugfs
entry, which should get rid of ras_ctrl node and remove
ip dependence when inputting by user. So add one new
auto_reboot node in ras debugfs dir to achieve this.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6220394521e4..3adcd29feb5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
-	else if (sscanf(str, "reboot %32s", block_name) == 1)
-		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
@@ -223,7 +221,6 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * - 0: disable RAS on the block. Take ::head as its data.
  * - 1: enable RAS on the block. Take ::head as its data.
  * - 2: inject errors on the block. Take ::inject as its data.
- * - 3: reboot on unrecoverable error
  *
  * How to use the interface?
  * programs:
@@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
 		break;
-	case 3:
-		amdgpu_ras_get_context(adev)->reboot = true;
-		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
 	return ret == 1 ? size : -EIO;
 }
 
+/**
+ * DOC: AMDGPU RAS debugfs auto reboot interface
+ *
+ * After one uncorrectable error happens, GPU recovery will be scheduled.
+ * Due to the known problem in GPU recovery failing to bring GPU back, this
+ * interface provides one direct way to user to reboot system automatically
+ * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery
+ * routine will never be called.
+ *
+ * Enable auto_reboot:
+ *
+ *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ * Revert auto_reboot:
+ *
+ * 	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ */
+static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
+	const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev =
+		(struct amdgpu_device *)file_inode(f)->i_private;
+	char tmp[8] = {0};
+	int value = -1;
+
+	if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size))
+		return -EINVAL;
+
+	if (kstrtoint(tmp, 10, &value))
+		return -EINVAL;
+
+	if (value == 1)
+		amdgpu_ras_get_context(adev)->reboot = true;
+	else if (value == 0)
+		amdgpu_ras_get_context(adev)->reboot = false;
+
+	return size;
+}
+
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 	.owner = THIS_MODULE,
 	.read = NULL,
@@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
 	.llseek = default_llseek
 };
 
+static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
+	.owner = THIS_MODULE,
+	.read = NULL,
+	.write = amdgpu_ras_debugfs_reboot_write,
+	.llseek = default_llseek
+};
+
 /**
  * DOC: AMDGPU RAS sysfs Error Count Interface
  *
@@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 				adev, &amdgpu_ras_debugfs_ctrl_ops);
 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
 				adev, &amdgpu_ras_debugfs_eeprom_ops);
+	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
+				adev, &amdgpu_ras_debugfs_reboot_ops);
 }
 
 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-10-21 15:40 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-21  9:33 [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case Chen, Guchun
     [not found] ` <20191021093245.28945-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
2019-10-21 11:13   ` Christian König
     [not found]     ` <058c4a9d-1ae2-5721-4404-794e575b42a8-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-10-21 14:04       ` Chen, Guchun
2019-10-21 15:40   ` Deucher, Alexander
  -- strict thread matches above, loose matches on Subject: below --
2019-10-21  9:08 Chen, Guchun
     [not found] ` <20191021090735.19696-1-guchun.chen-5C7GfCeVMHo@public.gmane.org>
2019-10-21  9:11   ` Christian König
     [not found]     ` <d1558d15-a1dc-e370-1410-ebfcafd01618-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-10-21  9:19       ` Chen, Guchun
2019-10-21 14:00   ` Deucher, Alexander

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.