All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: print unmap queue status for RAS poison consumption (v3)
@ 2022-03-22  3:29 Tao Zhou
  2022-03-22  3:42 ` Paul Menzel
  0 siblings, 1 reply; 5+ messages in thread
From: Tao Zhou @ 2022-03-22  3:29 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang,
	yipeng.chai, Lijo.Lazar, pmenzel
  Cc: Tao Zhou

Print the status out when it passes, and also tell user gpu reset
is triggered when we fallback to legacy way.

v2: make the message more explicitly.
v3: change succeeds to succeeded.
    replace pr_warn with dev_info.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 56902b5bb7b6..cc9ddc4b4cb8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -105,8 +105,6 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
 	if (old_poison)
 		return;
 
-	pr_warn("RAS poison consumption handling: client id %d\n", client_id);
-
 	switch (client_id) {
 	case SOC15_IH_CLIENTID_SE0SH:
 	case SOC15_IH_CLIENTID_SE1SH:
@@ -130,10 +128,17 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
 	/* resetting queue passes, do page retirement without gpu reset
 	 * resetting queue fails, fallback to gpu reset solution
 	 */
-	if (!ret)
+	if (!ret) {
+		dev_info(dev->adev->dev,
+			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
+			client_id);
 		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
-	else
+	} else {
+		dev_info(dev->adev->dev,
+			"RAS poison consumption, fallback to gpu reset flow: client id %d\n",
+			client_id);
 		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+	}
 }
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdkfd: print unmap queue status for RAS poison consumption (v3)
  2022-03-22  3:29 [PATCH] drm/amdkfd: print unmap queue status for RAS poison consumption (v3) Tao Zhou
@ 2022-03-22  3:42 ` Paul Menzel
  2022-03-22  3:51   ` Zhou1, Tao
  0 siblings, 1 reply; 5+ messages in thread
From: Paul Menzel @ 2022-03-22  3:42 UTC (permalink / raw)
  To: Tao Zhou
  Cc: Felix.Kuehling, Lijo.Lazar, amd-gfx, yipeng.chai, stanley.yang,
	hawking.zhang

Dear Tao,


Thank you for rerolling the patch.

Am 22.03.22 um 04:29 schrieb Tao Zhou:
> Print the status out when it passes, and also tell user gpu reset
> is triggered when we fallback to legacy way.

The verb *fall back* is written with a space.

> v2: make the message more explicitly.

explicit

> v3: change succeeds to succeeded.
>      replace pr_warn with dev_info.

Please explain in the commit message why the log level is decreased.
(I would also do that in a separate commit.)

> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 13 +++++++++----
>   1 file changed, 9 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 56902b5bb7b6..cc9ddc4b4cb8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -105,8 +105,6 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
>   	if (old_poison)
>   		return;
>   
> -	pr_warn("RAS poison consumption handling: client id %d\n", client_id);
> -
>   	switch (client_id) {
>   	case SOC15_IH_CLIENTID_SE0SH:
>   	case SOC15_IH_CLIENTID_SE1SH:
> @@ -130,10 +128,17 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
>   	/* resetting queue passes, do page retirement without gpu reset
>   	 * resetting queue fails, fallback to gpu reset solution
>   	 */
> -	if (!ret)
> +	if (!ret) {
> +		dev_info(dev->adev->dev,
> +			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
> +			client_id);
>   		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
> -	else
> +	} else {
> +		dev_info(dev->adev->dev,
> +			"RAS poison consumption, fallback to gpu reset flow: client id %d\n",

The verb *fall back* is written with a space.

> +			client_id);
>   		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
> +	}
>   }
>   
>   static bool event_interrupt_isr_v9(struct kfd_dev *dev,

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH] drm/amdkfd: print unmap queue status for RAS poison consumption (v3)
  2022-03-22  3:42 ` Paul Menzel
@ 2022-03-22  3:51   ` Zhou1, Tao
  2022-03-22  3:57     ` Paul Menzel
  0 siblings, 1 reply; 5+ messages in thread
From: Zhou1, Tao @ 2022-03-22  3:51 UTC (permalink / raw)
  To: Paul Menzel
  Cc: Kuehling, Felix, Lazar, Lijo, amd-gfx, Chai, Thomas, Yang,
	Stanley, Zhang, Hawking

[AMD Official Use Only]



> -----Original Message-----
> From: Paul Menzel <pmenzel@molgen.mpg.de>
> Sent: Tuesday, March 22, 2022 11:42 AM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang,
> Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>;
> Lazar, Lijo <Lijo.Lazar@amd.com>
> Subject: Re: [PATCH] drm/amdkfd: print unmap queue status for RAS poison
> consumption (v3)
> 
> Dear Tao,
> 
> 
> Thank you for rerolling the patch.
> 
> Am 22.03.22 um 04:29 schrieb Tao Zhou:
> > Print the status out when it passes, and also tell user gpu reset is
> > triggered when we fallback to legacy way.
> 
> The verb *fall back* is written with a space.

[Tao] will update it before push

> 
> > v2: make the message more explicitly.
> 
> explicit

[Tao] will update it before push.

> 
> > v3: change succeeds to succeeded.
> >      replace pr_warn with dev_info.
> 
> Please give the reason, why to decrease the debug level. (I would also do it in a
> separate commit.)

[Tao] Please check my reply to Lijo. It's an informational message, but RAS poison consumption is a special event, so both log levels are OK for me.

> 
> > Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 13 +++++++++----
> >   1 file changed, 9 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > index 56902b5bb7b6..cc9ddc4b4cb8 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > @@ -105,8 +105,6 @@ static void
> event_interrupt_poison_consumption(struct kfd_dev *dev,
> >   	if (old_poison)
> >   		return;
> >
> > -	pr_warn("RAS poison consumption handling: client id %d\n", client_id);
> > -
> >   	switch (client_id) {
> >   	case SOC15_IH_CLIENTID_SE0SH:
> >   	case SOC15_IH_CLIENTID_SE1SH:
> > @@ -130,10 +128,17 @@ static void
> event_interrupt_poison_consumption(struct kfd_dev *dev,
> >   	/* resetting queue passes, do page retirement without gpu reset
> >   	 * resetting queue fails, fallback to gpu reset solution
> >   	 */
> > -	if (!ret)
> > +	if (!ret) {
> > +		dev_info(dev->adev->dev,
> > +			"RAS poison consumption, unmap queue flow
> succeeded: client id %d\n",
> > +			client_id);
> >   		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev,
> false);
> > -	else
> > +	} else {
> > +		dev_info(dev->adev->dev,
> > +			"RAS poison consumption, fallback to gpu reset flow: client id %d\n",
> 
> The verb *fall back* is written with a space.

[Tao] will update it before push.

> 
> > +			client_id);
> >   		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev,
> true);
> > +	}
> >   }
> >
> >   static bool event_interrupt_isr_v9(struct kfd_dev *dev,

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdkfd: print unmap queue status for RAS poison consumption (v3)
  2022-03-22  3:51   ` Zhou1, Tao
@ 2022-03-22  3:57     ` Paul Menzel
  0 siblings, 0 replies; 5+ messages in thread
From: Paul Menzel @ 2022-03-22  3:57 UTC (permalink / raw)
  To: Tao Zhou
  Cc: Felix Kühling, Lijo Lazar, amd-gfx, Thomas Chai,
	Stanley Yang, Hawking Zhang


Dear Tao,


Am 22.03.22 um 04:51 schrieb Zhou1, Tao:
>> -----Original Message-----
>> From: Paul Menzel <pmenzel@molgen.mpg.de>
>> Sent: Tuesday, March 22, 2022 11:42 AM

[…]

>> Am 22.03.22 um 04:29 schrieb Tao Zhou:
>>> Print the status out when it passes, and also tell user gpu reset is
>>> triggered when we fallback to legacy way.
>>
>> The verb *fall back* is written with a space.
> 
> [Tao] will update it before push
> 
>>
>>> v2: make the message more explicitly.
>>
>> explicit
> 
> [Tao] will update it before push.
> 
>>
>>> v3: change succeeds to succeeded.
>>>       replace pr_warn with dev_info.
>>
>> Please give the reason, why to decrease the debug level. (I would also do it in a
>> separate commit.)
> 
> [Tao] please check my reply to Lijo. It's informational message, but
> ras poison consumption is special event, both debug levels are OK for
> me.

I saw that, but it has to be documented in the commit message. You 
cannot expect reviewers and readers of the commit to hunt down these 
discussions. Commit messages need to be self-contained.

>>> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 13 +++++++++----
>>>    1 file changed, 9 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> index 56902b5bb7b6..cc9ddc4b4cb8 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> @@ -105,8 +105,6 @@ static void
>> event_interrupt_poison_consumption(struct kfd_dev *dev,
>>>    	if (old_poison)
>>>    		return;
>>>
>>> -	pr_warn("RAS poison consumption handling: client id %d\n", client_id);
>>> -
>>>    	switch (client_id) {
>>>    	case SOC15_IH_CLIENTID_SE0SH:
>>>    	case SOC15_IH_CLIENTID_SE1SH:
>>> @@ -130,10 +128,17 @@ static void
>> event_interrupt_poison_consumption(struct kfd_dev *dev,
>>>    	/* resetting queue passes, do page retirement without gpu reset
>>>    	 * resetting queue fails, fallback to gpu reset solution
>>>    	 */
>>> -	if (!ret)
>>> +	if (!ret) {
>>> +		dev_info(dev->adev->dev,
>>> +			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
>>> +			client_id);
>>>    		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
>>> -	else
>>> +	} else {
>>> +		dev_info(dev->adev->dev,
>>> +			"RAS poison consumption, fallback to gpu reset flow: client id %d\n",
>>
>> The verb *fall back* is written with a space.
> 
> [Tao] will update it before push.
> 
>>
>>> +			client_id);
>>>    		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
>>> +	}
>>>    }
>>>
>>>    static bool event_interrupt_isr_v9(struct kfd_dev *dev,


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH] drm/amdkfd: print unmap queue status for RAS poison consumption (v3)
@ 2022-03-22  4:06 Tao Zhou
  0 siblings, 0 replies; 5+ messages in thread
From: Tao Zhou @ 2022-03-22  4:06 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang,
	yipeng.chai, Lijo.Lazar, pmenzel
  Cc: Tao Zhou

Print the status out when the unmap queue flow passes, and also tell the
user that a GPU reset is triggered when we fall back to the legacy way.

v2: make the message more explicit.
v3: change succeeds to succeeded.
    replace pr_warn with dev_warn.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 56902b5bb7b6..03c29bdd89a1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -105,8 +105,6 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
 	if (old_poison)
 		return;
 
-	pr_warn("RAS poison consumption handling: client id %d\n", client_id);
-
 	switch (client_id) {
 	case SOC15_IH_CLIENTID_SE0SH:
 	case SOC15_IH_CLIENTID_SE1SH:
@@ -130,10 +128,17 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
 	/* resetting queue passes, do page retirement without gpu reset
 	 * resetting queue fails, fallback to gpu reset solution
 	 */
-	if (!ret)
+	if (!ret) {
+		dev_warn(dev->adev->dev,
+			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
+			client_id);
 		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
-	else
+	} else {
+		dev_warn(dev->adev->dev,
+			"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
+			client_id);
 		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+	}
 }
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-03-22  4:06 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-22  3:29 [PATCH] drm/amdkfd: print unmap queue status for RAS poison consumption (v3) Tao Zhou
2022-03-22  3:42 ` Paul Menzel
2022-03-22  3:51   ` Zhou1, Tao
2022-03-22  3:57     ` Paul Menzel
2022-03-22  4:06 Tao Zhou

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.