amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption
@ 2022-05-10  6:29 Tao Zhou
  2022-05-10  6:29 ` [PATCH 2/2] drm/amdgpu: refine RAS poison consumption handler Tao Zhou
  2022-05-10  6:36 ` [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption Ziya, Mohammad zafar
  0 siblings, 2 replies; 4+ messages in thread
From: Tao Zhou @ 2022-05-10  6:29 UTC (permalink / raw)
  To: amd-gfx, Mohammadzafar.Ziya, Lijo.Lazar, hawking.zhang,
	stanley.yang, YiPeng.Chai
  Cc: Tao Zhou

Enable RAS IH if poison consumption handler is implemented.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index cac56f830aed..91d9e9969b4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2516,7 +2516,9 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
 		return 0;
 
 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
-	if (ras_obj->ras_cb) {
+	if (ras_obj->ras_cb || (ras_obj->hw_ops &&
+	    (ras_obj->hw_ops->query_poison_status ||
+	    ras_obj->hw_ops->handle_poison_consumption))) {
 		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
 		if (r)
 			goto cleanup;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/2] drm/amdgpu: refine RAS poison consumption handler
  2022-05-10  6:29 [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption Tao Zhou
@ 2022-05-10  6:29 ` Tao Zhou
  2022-05-10 10:14   ` Lazar, Lijo
  2022-05-10  6:36 ` [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption Ziya, Mohammad zafar
  1 sibling, 1 reply; 4+ messages in thread
From: Tao Zhou @ 2022-05-10  6:29 UTC (permalink / raw)
  To: amd-gfx, Mohammadzafar.Ziya, Lijo.Lazar, hawking.zhang,
	stanley.yang, YiPeng.Chai
  Cc: Tao Zhou

Query RAS status before RAS poison consumption handling, and add more
comments and log messages.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++++++++++++++----------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 91d9e9969b4e..a653cf3b3d13 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1538,33 +1538,42 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
 				struct amdgpu_iv_entry *entry)
 {
-	bool poison_stat = true, need_reset = true;
+	bool poison_stat = false;
 	struct amdgpu_device *adev = obj->adev;
 	struct ras_err_data err_data = {0, 0, 0, NULL};
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
-	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_poison_handler(adev, &err_data, false);
-
-	/* both query_poison_status and handle_poison_consumption are optional */
-	if (block_obj && block_obj->hw_ops) {
-		if (block_obj->hw_ops->query_poison_status) {
-			poison_stat = block_obj->hw_ops->query_poison_status(adev);
-			if (!poison_stat)
-				dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
-						block_obj->ras_comm.name);
-		}
+	if (!block_obj || !block_obj->hw_ops)
+		return;
 
-		if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
-			poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
-			need_reset = poison_stat;
+	/* both query_poison_status and handle_poison_consumption are optional,
+	 * but at least one of them should be implemented if we need poison
+	 * consumption handler
+	 */
+	if (block_obj->hw_ops->query_poison_status) {
+		poison_stat = block_obj->hw_ops->query_poison_status(adev);
+		if (!poison_stat) {
+			/* Not poison consumption interrupt, no need to handle it */
+			dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
+					block_obj->ras_comm.name);
+
+			return;
 		}
 	}
 
-	/* gpu reset is fallback for all failed cases */
-	if (need_reset)
+	if (!adev->gmc.xgmi.connected_to_cpu)
+		amdgpu_umc_poison_handler(adev, &err_data, false);
+
+	if (block_obj->hw_ops->handle_poison_consumption)
+		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
+
+	/* gpu reset is fallback for failed and default cases */
+	if (poison_stat) {
+		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
+				block_obj->ras_comm.name);
 		amdgpu_ras_reset_gpu(adev);
+	}
 }
 
 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* RE: [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption
  2022-05-10  6:29 [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption Tao Zhou
  2022-05-10  6:29 ` [PATCH 2/2] drm/amdgpu: refine RAS poison consumption handler Tao Zhou
@ 2022-05-10  6:36 ` Ziya, Mohammad zafar
  1 sibling, 0 replies; 4+ messages in thread
From: Ziya, Mohammad zafar @ 2022-05-10  6:36 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Lazar, Lijo, Zhang, Hawking, Yang, Stanley,
	Chai, Thomas
  Cc: Zhou1, Tao

[AMD Official Use Only - General]


Reviewed-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>

>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao
>Zhou
>Sent: Tuesday, May 10, 2022 12:00 PM
>To: amd-gfx@lists.freedesktop.org; Ziya, Mohammad zafar
><Mohammadzafar.Ziya@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>;
>Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley
><Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
>Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
>Subject: [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption
>
>Enable RAS IH if poison consumption handler is implemented.
>
>Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>index cac56f830aed..91d9e9969b4e 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>@@ -2516,7 +2516,9 @@ int amdgpu_ras_block_late_init(struct
>amdgpu_device *adev,
> 		return 0;
>
> 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object,
>ras_comm);
>-	if (ras_obj->ras_cb) {
>+	if (ras_obj->ras_cb || (ras_obj->hw_ops &&
>+	    (ras_obj->hw_ops->query_poison_status ||
>+	    ras_obj->hw_ops->handle_poison_consumption))) {
> 		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
> 		if (r)
> 			goto cleanup;
>--
>2.35.1

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: refine RAS poison consumption handler
  2022-05-10  6:29 ` [PATCH 2/2] drm/amdgpu: refine RAS poison consumption handler Tao Zhou
@ 2022-05-10 10:14   ` Lazar, Lijo
  0 siblings, 0 replies; 4+ messages in thread
From: Lazar, Lijo @ 2022-05-10 10:14 UTC (permalink / raw)
  To: Tao Zhou, amd-gfx, Mohammadzafar.Ziya, hawking.zhang,
	stanley.yang, YiPeng.Chai



On 5/10/2022 11:59 AM, Tao Zhou wrote:
> Query RAS status before RAS poison consumption handling, and add more
> comments and log messages.
> 
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>

Series is :
	Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>

Thanks,
Lijo

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++++++++++++++----------
>   1 file changed, 26 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 91d9e9969b4e..a653cf3b3d13 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1538,33 +1538,42 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
>   static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
>   				struct amdgpu_iv_entry *entry)
>   {
> -	bool poison_stat = true, need_reset = true;
> +	bool poison_stat = false;
>   	struct amdgpu_device *adev = obj->adev;
>   	struct ras_err_data err_data = {0, 0, 0, NULL};
>   	struct amdgpu_ras_block_object *block_obj =
>   		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
>   
> -	if (!adev->gmc.xgmi.connected_to_cpu)
> -		amdgpu_umc_poison_handler(adev, &err_data, false);
> -
> -	/* both query_poison_status and handle_poison_consumption are optional */
> -	if (block_obj && block_obj->hw_ops) {
> -		if (block_obj->hw_ops->query_poison_status) {
> -			poison_stat = block_obj->hw_ops->query_poison_status(adev);
> -			if (!poison_stat)
> -				dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
> -						block_obj->ras_comm.name);
> -		}
> +	if (!block_obj || !block_obj->hw_ops)
> +		return;
>   
> -		if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
> -			poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
> -			need_reset = poison_stat;
> +	/* both query_poison_status and handle_poison_consumption are optional,
> +	 * but at least one of them should be implemented if we need poison
> +	 * consumption handler
> +	 */
> +	if (block_obj->hw_ops->query_poison_status) {
> +		poison_stat = block_obj->hw_ops->query_poison_status(adev);
> +		if (!poison_stat) {
> +			/* Not poison consumption interrupt, no need to handle it */
> +			dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
> +					block_obj->ras_comm.name);
> +
> +			return;
>   		}
>   	}
>   
> -	/* gpu reset is fallback for all failed cases */
> -	if (need_reset)
> +	if (!adev->gmc.xgmi.connected_to_cpu)
> +		amdgpu_umc_poison_handler(adev, &err_data, false);
> +
> +	if (block_obj->hw_ops->handle_poison_consumption)
> +		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
> +
> +	/* gpu reset is fallback for failed and default cases */
> +	if (poison_stat) {
> +		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
> +				block_obj->ras_comm.name);
>   		amdgpu_ras_reset_gpu(adev);
> +	}
>   }
>   
>   static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-05-10 10:15 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-10  6:29 [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption Tao Zhou
2022-05-10  6:29 ` [PATCH 2/2] drm/amdgpu: refine RAS poison consumption handler Tao Zhou
2022-05-10 10:14   ` Lazar, Lijo
2022-05-10  6:36 ` [PATCH 1/2] drm/amdgpu: enable RAS IH for poison consumption Ziya, Mohammad zafar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).