All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface
@ 2022-03-29  7:38 yipechai
  2022-03-29  7:49 ` Christian König
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: yipechai @ 2022-03-29  7:38 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, John.Clements, yipechai, yipechai

Some AMDGPU RAS debugfs operations like UE injection
can cause gpu reset. Before doing the next debugfs
operation, the application should call poll to check
if the gpu has finished recovering.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..337e3e247a45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
+
+		if (!ret && (data.head.type == AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
+			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+			con->ras_ue_injected = 1;
+		}
 		break;
 	default:
 		ret = -EINVAL;
@@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 	return size;
 }
 
+/**
+ * DOC: Support AMDGPU RAS debugfs poll interface
+ *
+ * Some AMDGPU RAS debugfs operations like UE injection
+ * can cause gpu reset. Before doing the next debugfs
+ * operation, the application should call poll to check
+ * if gpu is in recovering status.
+ */
+static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct poll_table_struct *wait)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	__poll_t mask = 0;
+
+	/* For UE injection, wait for gpu to finish recovery */
+	if (con->ras_ue_injected)
+		poll_wait(f, &con->gpu_ready_wait_wq, wait);
+
+	if (!atomic_read(&con->in_recovery))
+		mask = EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
 /**
  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
  *
@@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
 
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 	.owner = THIS_MODULE,
+	.poll = amdgpu_ras_debugfs_ctrl_poll,
 	.read = NULL,
 	.write = amdgpu_ras_debugfs_ctrl_write,
 	.llseek = default_llseek
@@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	if (amdgpu_device_should_recover_gpu(ras->adev))
 		amdgpu_device_gpu_recover(ras->adev, NULL);
 	atomic_set(&ras->in_recovery, 0);
+
+	if (ras->ras_ue_injected) {
+		ras->ras_ue_injected = 0;
+		wake_up_all(&ras->gpu_ready_wait_wq);
+	}
 }
 
 /* alloc/realloc bps array */
@@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
 	atomic_set(&con->ras_ce_count, 0);
 	atomic_set(&con->ras_ue_count, 0);
-
+	init_waitqueue_head(&con->gpu_ready_wait_wq);
 	con->objs = (struct ras_manager *)(con + 1);
 
 	amdgpu_ras_set_context(adev, con);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 606df8869b89..aea6bbb71501 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -379,6 +379,12 @@ struct amdgpu_ras {
 
 	/* Indicates smu whether need update bad channel info */
 	bool update_channel_flag;
+
+	/* UE injection flag */
+	uint32_t  ras_ue_injected;
+
+	/* Waiting for gpu ready work queue */
+	wait_queue_head_t gpu_ready_wait_wq;
 };
 
 struct ras_fs_data {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface
  2022-03-29  7:38 [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface yipechai
@ 2022-03-29  7:49 ` Christian König
  2022-03-29  8:21 ` Paul Menzel
  2022-03-29  8:33 ` Zhang, Hawking
  2 siblings, 0 replies; 6+ messages in thread
From: Christian König @ 2022-03-29  7:49 UTC (permalink / raw)
  To: yipechai, amd-gfx; +Cc: yipechai, Tao.Zhou1, John.Clements, Hawking.Zhang

Am 29.03.22 um 09:38 schrieb yipechai:
> Some AMDGPU RAS debugfs operations like UE injection
> can cause gpu reset. Before doing the next debugfs
> operation, the application should call poll to check
> if the gpu has finished recovering.

Well NAK. debugfs files are designed to be used from the command-line 
and can be opened and read/written at any time.

If we need to prevent issuing the next operation before the previous one 
has finished we need to use locks for that.

Especially if not doing so results in a hard system lockup.

Regards,
Christian.

>
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 ++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++++
>   2 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4bbed76b79c8..337e3e247a45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>   
>   		/* data.inject.address is offset instead of absolute gpu address */
>   		ret = amdgpu_ras_error_inject(adev, &data.inject);
> +
> +		if (!ret && (data.head.type == AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
> +			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +
> +			con->ras_ue_injected = 1;
> +		}
>   		break;
>   	default:
>   		ret = -EINVAL;
> @@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>   	return size;
>   }
>   
> +/**
> + * DOC: Support AMDGPU RAS debugfs poll interface
> + *
> + * Some AMDGPU RAS debugfs operations like UE injection
> + * can cause gpu reset. Before doing the next debugfs
> + * operation, the application should call poll to check
> + * if gpu is in recovering status.
> + */
> +static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct poll_table_struct *wait)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	__poll_t mask = 0;
> +
> +	/* For UE injection, wait for gpu to finish recovery */
> +	if (con->ras_ue_injected)
> +		poll_wait(f, &con->gpu_ready_wait_wq, wait);
> +
> +	if (!atomic_read(&con->in_recovery))
> +		mask = EPOLLIN | EPOLLRDNORM;
> +
> +	return mask;
> +}
> +
>   /**
>    * DOC: AMDGPU RAS debugfs EEPROM table reset interface
>    *
> @@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
>   
>   static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>   	.owner = THIS_MODULE,
> +	.poll = amdgpu_ras_debugfs_ctrl_poll,
>   	.read = NULL,
>   	.write = amdgpu_ras_debugfs_ctrl_write,
>   	.llseek = default_llseek
> @@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
>   	if (amdgpu_device_should_recover_gpu(ras->adev))
>   		amdgpu_device_gpu_recover(ras->adev, NULL);
>   	atomic_set(&ras->in_recovery, 0);
> +
> +	if (ras->ras_ue_injected) {
> +		ras->ras_ue_injected = 0;
> +		wake_up_all(&ras->gpu_ready_wait_wq);
> +	}
>   }
>   
>   /* alloc/realloc bps array */
> @@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
>   	atomic_set(&con->ras_ce_count, 0);
>   	atomic_set(&con->ras_ue_count, 0);
> -
> +	init_waitqueue_head(&con->gpu_ready_wait_wq);
>   	con->objs = (struct ras_manager *)(con + 1);
>   
>   	amdgpu_ras_set_context(adev, con);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 606df8869b89..aea6bbb71501 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -379,6 +379,12 @@ struct amdgpu_ras {
>   
>   	/* Indicates smu whether need update bad channel info */
>   	bool update_channel_flag;
> +
> +	/* UE injection flag */
> +	uint32_t  ras_ue_injected;
> +
> +	/* Waiting for gpu ready work queue */
> +	wait_queue_head_t gpu_ready_wait_wq;
>   };
>   
>   struct ras_fs_data {


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface
  2022-03-29  7:38 [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface yipechai
  2022-03-29  7:49 ` Christian König
@ 2022-03-29  8:21 ` Paul Menzel
  2022-03-29  8:33 ` Zhang, Hawking
  2 siblings, 0 replies; 6+ messages in thread
From: Paul Menzel @ 2022-03-29  8:21 UTC (permalink / raw)
  To: YiPeng.Chai; +Cc: yipechai, Tao.Zhou1, John.Clements, amd-gfx, Hawking.Zhang

Dear Yi Peng,


Thank you for the patch. Two nits regarding the commit message.

Am 29.03.22 um 09:38 schrieb yipechai:
> Some AMDGPU RAS debugfs operations like UE injection
> can cause gpu reset. Before doing the next debugfs
> operation, the application should call poll to check
> if the gpu has finished recovering.

Please use a text width of 75 characters per line for the commit message 
body.

> Signed-off-by: yipechai <YiPeng.Chai@amd.com>

Could you please configure git (and your email program) to use your full 
name?

     $ git config --global user.name "Yi Peng Chai" # no idea if correct

[…]


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface
  2022-03-29  7:38 [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface yipechai
  2022-03-29  7:49 ` Christian König
  2022-03-29  8:21 ` Paul Menzel
@ 2022-03-29  8:33 ` Zhang, Hawking
  2022-03-29 13:18   ` Chai, Thomas
  2 siblings, 1 reply; 6+ messages in thread
From: Zhang, Hawking @ 2022-03-29  8:33 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx; +Cc: Zhou1, Tao, Clements, John

[AMD Official Use Only]

I'm not sure I understand the fix correctly - It seems to me it is trying to stop user/test cases that initiate error injection request back-to-back? But anyway, we shouldn't make the change or leverage debugfs for that purpose, and there is no guarantee test scripts/applications will follow the rule as well. 

I guess we need to identify the root cause case by case and stop the invalid request in kernel driver.

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com> 
Sent: Tuesday, March 29, 2022 15:39
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Clements, John <John.Clements@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

Some AMDGPU RAS debugfs operations like UE injection can cause gpu reset. Before doing the next debugfs operation, the application should call poll to check if the gpu has finished recovering.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 ++++++++++++++++++++++++-  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..337e3e247a45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
+
+		if (!ret && (data.head.type == AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
+			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+			con->ras_ue_injected = 1;
+		}
 		break;
 	default:
 		ret = -EINVAL;
@@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 	return size;
 }
 
+/**
+ * DOC: Support AMDGPU RAS debugfs poll interface
+ *
+ * Some AMDGPU RAS debugfs operations like UE injection
+ * can cause gpu reset. Before doing the next debugfs
+ * operation, the application should call poll to check
+ * if gpu is in recovering status.
+ */
+static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct 
+poll_table_struct *wait) {
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	__poll_t mask = 0;
+
+	/* For UE injection, wait for gpu to finish recovery */
+	if (con->ras_ue_injected)
+		poll_wait(f, &con->gpu_ready_wait_wq, wait);
+
+	if (!atomic_read(&con->in_recovery))
+		mask = EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
 /**
  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
  *
@@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
 
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 	.owner = THIS_MODULE,
+	.poll = amdgpu_ras_debugfs_ctrl_poll,
 	.read = NULL,
 	.write = amdgpu_ras_debugfs_ctrl_write,
 	.llseek = default_llseek
@@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	if (amdgpu_device_should_recover_gpu(ras->adev))
 		amdgpu_device_gpu_recover(ras->adev, NULL);
 	atomic_set(&ras->in_recovery, 0);
+
+	if (ras->ras_ue_injected) {
+		ras->ras_ue_injected = 0;
+		wake_up_all(&ras->gpu_ready_wait_wq);
+	}
 }
 
 /* alloc/realloc bps array */
@@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
 	atomic_set(&con->ras_ce_count, 0);
 	atomic_set(&con->ras_ue_count, 0);
-
+	init_waitqueue_head(&con->gpu_ready_wait_wq);
 	con->objs = (struct ras_manager *)(con + 1);
 
 	amdgpu_ras_set_context(adev, con);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 606df8869b89..aea6bbb71501 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -379,6 +379,12 @@ struct amdgpu_ras {
 
 	/* Indicates smu whether need update bad channel info */
 	bool update_channel_flag;
+
+	/* UE injection flag */
+	uint32_t  ras_ue_injected;
+
+	/* Waiting for gpu ready work queue */
+	wait_queue_head_t gpu_ready_wait_wq;
 };
 
 struct ras_fs_data {
--
2.25.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* RE: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface
  2022-03-29  8:33 ` Zhang, Hawking
@ 2022-03-29 13:18   ` Chai, Thomas
  2022-03-29 13:20     ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Chai, Thomas @ 2022-03-29 13:18 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx; +Cc: Zhou1, Tao, Clements, John

[AMD Official Use Only]

Sorry for the confusing commit message。 
This interface is only for amdgpu ras tool new function. There is no impact on currently existing tools and scripts。

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@amd.com> 
Sent: Tuesday, March 29, 2022 4:33 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Clements, John <John.Clements@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

[AMD Official Use Only]

I'm not sure I understand the fix correctly - It seems to me it is trying to stop user/test cases that initiate error injection request back-to-back? But anyway, we shouldn't make the change or leverage debugfs for that purpose, and there is no guarantee test scripts/applications will follow the rule as well. 

I guess we need to identify the root cause case by case and stop the invalid request in kernel driver.

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Tuesday, March 29, 2022 15:39
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Clements, John <John.Clements@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

Some AMDGPU RAS debugfs operations like UE injection can cause gpu reset. Before doing the next debugfs operation, the application should call poll to check if the gpu has finished recovering.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 ++++++++++++++++++++++++-  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..337e3e247a45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
+
+		if (!ret && (data.head.type == AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
+			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+			con->ras_ue_injected = 1;
+		}
 		break;
 	default:
 		ret = -EINVAL;
@@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 	return size;
 }
 
+/**
+ * DOC: Support AMDGPU RAS debugfs poll interface
+ *
+ * Some AMDGPU RAS debugfs operations like UE injection
+ * can cause gpu reset. Before doing the next debugfs
+ * operation, the application should call poll to check
+ * if gpu is in recovering status.
+ */
+static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct 
+poll_table_struct *wait) {
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	__poll_t mask = 0;
+
+	/* For UE injection, wait for gpu to finish recovery */
+	if (con->ras_ue_injected)
+		poll_wait(f, &con->gpu_ready_wait_wq, wait);
+
+	if (!atomic_read(&con->in_recovery))
+		mask = EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
 /**
  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
  *
@@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
 
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 	.owner = THIS_MODULE,
+	.poll = amdgpu_ras_debugfs_ctrl_poll,
 	.read = NULL,
 	.write = amdgpu_ras_debugfs_ctrl_write,
 	.llseek = default_llseek
@@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	if (amdgpu_device_should_recover_gpu(ras->adev))
 		amdgpu_device_gpu_recover(ras->adev, NULL);
 	atomic_set(&ras->in_recovery, 0);
+
+	if (ras->ras_ue_injected) {
+		ras->ras_ue_injected = 0;
+		wake_up_all(&ras->gpu_ready_wait_wq);
+	}
 }
 
 /* alloc/realloc bps array */
@@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
 	atomic_set(&con->ras_ce_count, 0);
 	atomic_set(&con->ras_ue_count, 0);
-
+	init_waitqueue_head(&con->gpu_ready_wait_wq);
 	con->objs = (struct ras_manager *)(con + 1);
 
 	amdgpu_ras_set_context(adev, con);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 606df8869b89..aea6bbb71501 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -379,6 +379,12 @@ struct amdgpu_ras {
 
 	/* Indicates smu whether need update bad channel info */
 	bool update_channel_flag;
+
+	/* UE injection flag */
+	uint32_t  ras_ue_injected;
+
+	/* Waiting for gpu ready work queue */
+	wait_queue_head_t gpu_ready_wait_wq;
 };
 
 struct ras_fs_data {
--
2.25.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface
  2022-03-29 13:18   ` Chai, Thomas
@ 2022-03-29 13:20     ` Christian König
  0 siblings, 0 replies; 6+ messages in thread
From: Christian König @ 2022-03-29 13:20 UTC (permalink / raw)
  To: Chai, Thomas, Zhang, Hawking, amd-gfx; +Cc: Zhou1, Tao, Clements, John

Well that is a really bad idea.

Tools should only use sysfs functionality, not debugfs since that isn't 
stable.

And I totally agree to Hawking here. As I wrote on the other mail this 
patch is a no-go.

Regards,
Christian.

Am 29.03.22 um 15:18 schrieb Chai, Thomas:
> [AMD Official Use Only]
>
> Sorry for the confusing commit message。
> This interface is only for amdgpu ras tool new function. There is no impact on currently existing tools and scripts。
>
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@amd.com>
> Sent: Tuesday, March 29, 2022 4:33 PM
> To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Clements, John <John.Clements@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface
>
> [AMD Official Use Only]
>
> I'm not sure I understand the fix correctly - It seems to me it is trying to stop user/test cases that initiate error injection request back-to-back? But anyway, we shouldn't make the change or leverage debugfs for that purpose, and there is no guarantee test scripts/applications will follow the rule as well.
>
> I guess we need to identify the root cause case by case and stop the invalid request in kernel driver.
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Tuesday, March 29, 2022 15:39
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Clements, John <John.Clements@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface
>
> Some AMDGPU RAS debugfs operations like UE injection can cause gpu reset. Before doing the next debugfs operation, the application should call poll to check if the gpu has finished recovering.
>
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 ++++++++++++++++++++++++-  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++++
>   2 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4bbed76b79c8..337e3e247a45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>   
>   		/* data.inject.address is offset instead of absolute gpu address */
>   		ret = amdgpu_ras_error_inject(adev, &data.inject);
> +
> +		if (!ret && (data.head.type == AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
> +			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +
> +			con->ras_ue_injected = 1;
> +		}
>   		break;
>   	default:
>   		ret = -EINVAL;
> @@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>   	return size;
>   }
>   
> +/**
> + * DOC: Support AMDGPU RAS debugfs poll interface
> + *
> + * Some AMDGPU RAS debugfs operations like UE injection
> + * can cause gpu reset. Before doing the next debugfs
> + * operation, the application should call poll to check
> + * if gpu is in recovering status.
> + */
> +static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct
> +poll_table_struct *wait) {
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	__poll_t mask = 0;
> +
> +	/* For UE injection, wait for gpu to finish recovery */
> +	if (con->ras_ue_injected)
> +		poll_wait(f, &con->gpu_ready_wait_wq, wait);
> +
> +	if (!atomic_read(&con->in_recovery))
> +		mask = EPOLLIN | EPOLLRDNORM;
> +
> +	return mask;
> +}
> +
>   /**
>    * DOC: AMDGPU RAS debugfs EEPROM table reset interface
>    *
> @@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
>   
>   static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>   	.owner = THIS_MODULE,
> +	.poll = amdgpu_ras_debugfs_ctrl_poll,
>   	.read = NULL,
>   	.write = amdgpu_ras_debugfs_ctrl_write,
>   	.llseek = default_llseek
> @@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
>   	if (amdgpu_device_should_recover_gpu(ras->adev))
>   		amdgpu_device_gpu_recover(ras->adev, NULL);
>   	atomic_set(&ras->in_recovery, 0);
> +
> +	if (ras->ras_ue_injected) {
> +		ras->ras_ue_injected = 0;
> +		wake_up_all(&ras->gpu_ready_wait_wq);
> +	}
>   }
>   
>   /* alloc/realloc bps array */
> @@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
>   	atomic_set(&con->ras_ce_count, 0);
>   	atomic_set(&con->ras_ue_count, 0);
> -
> +	init_waitqueue_head(&con->gpu_ready_wait_wq);
>   	con->objs = (struct ras_manager *)(con + 1);
>   
>   	amdgpu_ras_set_context(adev, con);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 606df8869b89..aea6bbb71501 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -379,6 +379,12 @@ struct amdgpu_ras {
>   
>   	/* Indicates smu whether need update bad channel info */
>   	bool update_channel_flag;
> +
> +	/* UE injection flag */
> +	uint32_t  ras_ue_injected;
> +
> +	/* Waiting for gpu ready work queue */
> +	wait_queue_head_t gpu_ready_wait_wq;
>   };
>   
>   struct ras_fs_data {
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-03-29 13:21 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-29  7:38 [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface yipechai
2022-03-29  7:49 ` Christian König
2022-03-29  8:21 ` Paul Menzel
2022-03-29  8:33 ` Zhang, Hawking
2022-03-29 13:18   ` Chai, Thomas
2022-03-29 13:20     ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.