All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] drm/amdgpu: add safeguards for querying GMC CG state
@ 2022-01-28  6:54 Lang Yu
  2022-01-28  8:31 ` Lazar, Lijo
  0 siblings, 1 reply; 6+ messages in thread
From: Lang Yu @ 2022-01-28  6:54 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Lang Yu, Lijo Lazar, Huang Rui, Christian Koenig

We observed a GPU hang when querying GMC CG state(i.e.,
cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
skillfish doesn't support any CG features.

Only allow ASICs which support GMC CG features accessing
related registers. As some ASICs support GMC CG but cg_flags
are not set. Use GC IP version instead of cg_flags to
determine whether GMC CG is supported or not.

v2:
 - Use a function to encapsulate more functionality.(Christian)
 - Use IP verion to determine whether CG is supported or not.(Lijo)

Signed-off-by: Lang Yu <Lang.Yu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 10 ++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  3 +++
 5 files changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index d426de48d299..be1f03b02af6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -876,3 +876,13 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
 
 	return 0;
 }
+
+bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev)
+{
+	switch (adev->ip_versions[GC_HWIP][0]) {
+	case IP_VERSION(10, 1, 3):
+		return false;
+	default:
+		return true;
+	}
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 93505bb0a36c..b916e73c7de1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -338,4 +338,5 @@ uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
 uint64_t amdgpu_gmc_vram_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
 uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
 int amdgpu_gmc_vram_checking(struct amdgpu_device *adev);
+bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 73ab0eebe4e2..4e46f618d6c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	if (!amdgpu_gmc_cg_enabled(adev))
+		return;
+
 	adev->mmhub.funcs->get_clockgating(adev, flags);
 
 	if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index ca9841d5669f..ff9dff2a6cf1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1695,6 +1695,9 @@ static void gmc_v8_0_get_clockgating_state(void *handle, u32 *flags)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int data;
 
+	if (!amdgpu_gmc_cg_enabled(adev))
+		return;
+
 	if (amdgpu_sriov_vf(adev))
 		*flags = 0;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 4595027a8c63..faf017609dfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1952,6 +1952,9 @@ static void gmc_v9_0_get_clockgating_state(void *handle, u32 *flags)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	if (!amdgpu_gmc_cg_enabled(adev))
+		return;
+
 	adev->mmhub.funcs->get_clockgating(adev, flags);
 
 	athub_v1_0_get_clockgating(adev, flags);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add safeguards for querying GMC CG state
  2022-01-28  6:54 [PATCH v2] drm/amdgpu: add safeguards for querying GMC CG state Lang Yu
@ 2022-01-28  8:31 ` Lazar, Lijo
  2022-01-28  8:52   ` Lang Yu
  0 siblings, 1 reply; 6+ messages in thread
From: Lazar, Lijo @ 2022-01-28  8:31 UTC (permalink / raw)
  To: Lang Yu, amd-gfx; +Cc: Alex Deucher, Huang Rui, Christian Koenig



On 1/28/2022 12:24 PM, Lang Yu wrote:
> We observed a GPU hang when querying GMC CG state(i.e.,
> cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
> skillfish doesn't support any CG features.
> 
> Only allow ASICs which support GMC CG features accessing
> related registers. As some ASICs support GMC CG but cg_flags
> are not set. Use GC IP version instead of cg_flags to
> determine whether GMC CG is supported or not.
> 
> v2:
>   - Use a function to encapsulate more functionality.(Christian)
>   - Use IP verion to determine whether CG is supported or not.(Lijo)
> 
> Signed-off-by: Lang Yu <Lang.Yu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 10 ++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  3 +++
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  3 +++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  3 +++
>   5 files changed, 20 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index d426de48d299..be1f03b02af6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -876,3 +876,13 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
>   
>   	return 0;
>   }
> +
> +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev)
> +{
> +	switch (adev->ip_versions[GC_HWIP][0]) {
> +	case IP_VERSION(10, 1, 3):
> +		return false;
> +	default:
> +		return true;
> +	}
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 93505bb0a36c..b916e73c7de1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -338,4 +338,5 @@ uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
>   uint64_t amdgpu_gmc_vram_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>   uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>   int amdgpu_gmc_vram_checking(struct amdgpu_device *adev);
> +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 73ab0eebe4e2..4e46f618d6c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	if (!amdgpu_gmc_cg_enabled(adev))
> +		return;
> +

I think Christian suggested amdgpu_gmc_cg_enabled function assuming it's 
a common logic for all ASICs based on flags. Now that assumption has 
changed. Now the logic is a specific IP version doesn't enable CG which 
is known beforehand. So we could maintain the check in the specific IP 
version block itself (gmc 10 in this example). No need to call another 
common function which checks IP version again.

Thanks,
Lijo

>   	adev->mmhub.funcs->get_clockgating(adev, flags);
>   
>   	if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index ca9841d5669f..ff9dff2a6cf1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -1695,6 +1695,9 @@ static void gmc_v8_0_get_clockgating_state(void *handle, u32 *flags)
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   	int data;
>   
> +	if (!amdgpu_gmc_cg_enabled(adev))
> +		return;
> +
>   	if (amdgpu_sriov_vf(adev))
>   		*flags = 0;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 4595027a8c63..faf017609dfe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1952,6 +1952,9 @@ static void gmc_v9_0_get_clockgating_state(void *handle, u32 *flags)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	if (!amdgpu_gmc_cg_enabled(adev))
> +		return;
> +
>   	adev->mmhub.funcs->get_clockgating(adev, flags);
>   
>   	athub_v1_0_get_clockgating(adev, flags);
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add safeguards for querying GMC CG state
  2022-01-28  8:31 ` Lazar, Lijo
@ 2022-01-28  8:52   ` Lang Yu
  2022-01-28  9:06     ` Lazar, Lijo
  0 siblings, 1 reply; 6+ messages in thread
From: Lang Yu @ 2022-01-28  8:52 UTC (permalink / raw)
  To: Lazar, Lijo; +Cc: Alex Deucher, Huang Rui, Christian Koenig, amd-gfx

On 01/28/ , Lazar, Lijo wrote:
> 
> 
> On 1/28/2022 12:24 PM, Lang Yu wrote:
> > We observed a GPU hang when querying GMC CG state(i.e.,
> > cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
> > skillfish doesn't support any CG features.
> > 
> > Only allow ASICs which support GMC CG features accessing
> > related registers. As some ASICs support GMC CG but cg_flags
> > are not set. Use GC IP version instead of cg_flags to
> > determine whether GMC CG is supported or not.
> > 
> > v2:
> >   - Use a function to encapsulate more functionality.(Christian)
> >   - Use IP verion to determine whether CG is supported or not.(Lijo)
> > 
> > Signed-off-by: Lang Yu <Lang.Yu@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 10 ++++++++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
> >   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  3 +++
> >   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  3 +++
> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  3 +++
> >   5 files changed, 20 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > index d426de48d299..be1f03b02af6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > @@ -876,3 +876,13 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
> >   	return 0;
> >   }
> > +
> > +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev)
> > +{
> > +	switch (adev->ip_versions[GC_HWIP][0]) {
> > +	case IP_VERSION(10, 1, 3):
> > +		return false;
> > +	default:
> > +		return true;
> > +	}
> > +}
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > index 93505bb0a36c..b916e73c7de1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > @@ -338,4 +338,5 @@ uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
> >   uint64_t amdgpu_gmc_vram_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
> >   uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
> >   int amdgpu_gmc_vram_checking(struct amdgpu_device *adev);
> > +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev);
> >   #endif
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index 73ab0eebe4e2..4e46f618d6c1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
> >   {
> >   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > +	if (!amdgpu_gmc_cg_enabled(adev))
> > +		return;
> > +
> 
> I think Christian suggested amdgpu_gmc_cg_enabled function assuming it's a
> common logic for all ASICs based on flags. Now that assumption has changed.
> Now the logic is a specific IP version doesn't enable CG which is known
> beforehand. So we could maintain the check in the specific IP version block
> itself (gmc 10 in this example). No need to call another common function
> which checks IP version again.

Thanks. You mean just like this?

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 73ab0eebe4e2..bddaf2417344 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

+       if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 3))
+               return;
+
        adev->mmhub.funcs->get_clockgating(adev, flags);

        if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))

Regards,
Lang

> Thanks,
> Lijo
> 
> >   	adev->mmhub.funcs->get_clockgating(adev, flags);
> >   	if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> > index ca9841d5669f..ff9dff2a6cf1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> > @@ -1695,6 +1695,9 @@ static void gmc_v8_0_get_clockgating_state(void *handle, u32 *flags)
> >   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> >   	int data;
> > +	if (!amdgpu_gmc_cg_enabled(adev))
> > +		return;
> > +
> >   	if (amdgpu_sriov_vf(adev))
> >   		*flags = 0;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 4595027a8c63..faf017609dfe 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -1952,6 +1952,9 @@ static void gmc_v9_0_get_clockgating_state(void *handle, u32 *flags)
> >   {
> >   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > +	if (!amdgpu_gmc_cg_enabled(adev))
> > +		return;
> > +
> >   	adev->mmhub.funcs->get_clockgating(adev, flags);
> >   	athub_v1_0_get_clockgating(adev, flags);
> > 

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add safeguards for querying GMC CG state
  2022-01-28  8:52   ` Lang Yu
@ 2022-01-28  9:06     ` Lazar, Lijo
  2022-01-28  9:16       ` Lang Yu
  0 siblings, 1 reply; 6+ messages in thread
From: Lazar, Lijo @ 2022-01-28  9:06 UTC (permalink / raw)
  To: Lang Yu; +Cc: Alex Deucher, Huang Rui, Christian Koenig, amd-gfx



On 1/28/2022 2:22 PM, Lang Yu wrote:
> On 01/28/ , Lazar, Lijo wrote:
>>
>>
>> On 1/28/2022 12:24 PM, Lang Yu wrote:
>>> We observed a GPU hang when querying GMC CG state(i.e.,
>>> cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
>>> skillfish doesn't support any CG features.
>>>
>>> Only allow ASICs which support GMC CG features accessing
>>> related registers. As some ASICs support GMC CG but cg_flags
>>> are not set. Use GC IP version instead of cg_flags to
>>> determine whether GMC CG is supported or not.
>>>
>>> v2:
>>>    - Use a function to encapsulate more functionality.(Christian)
>>>    - Use IP verion to determine whether CG is supported or not.(Lijo)
>>>
>>> Signed-off-by: Lang Yu <Lang.Yu@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 10 ++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  3 +++
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  3 +++
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  3 +++
>>>    5 files changed, 20 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>> index d426de48d299..be1f03b02af6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>> @@ -876,3 +876,13 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
>>>    	return 0;
>>>    }
>>> +
>>> +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev)
>>> +{
>>> +	switch (adev->ip_versions[GC_HWIP][0]) {
>>> +	case IP_VERSION(10, 1, 3):
>>> +		return false;
>>> +	default:
>>> +		return true;
>>> +	}
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> index 93505bb0a36c..b916e73c7de1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>> @@ -338,4 +338,5 @@ uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
>>>    uint64_t amdgpu_gmc_vram_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>>>    uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>>>    int amdgpu_gmc_vram_checking(struct amdgpu_device *adev);
>>> +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev);
>>>    #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 73ab0eebe4e2..4e46f618d6c1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
>>>    {
>>>    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +	if (!amdgpu_gmc_cg_enabled(adev))
>>> +		return;
>>> +
>>
>> I think Christian suggested amdgpu_gmc_cg_enabled function assuming it's a
>> common logic for all ASICs based on flags. Now that assumption has changed.
>> Now the logic is a specific IP version doesn't enable CG which is known
>> beforehand. So we could maintain the check in the specific IP version block
>> itself (gmc 10 in this example). No need to call another common function
>> which checks IP version again.
> 
> Thanks. You mean just like this?
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 73ab0eebe4e2..bddaf2417344 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
>   {
>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
> +       if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 3))
		*flags = 0;

Yes, add the above line also.

Thanks,
Lijo
> +               return;
> +
>          adev->mmhub.funcs->get_clockgating(adev, flags);
> 
>          if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
> 
> Regards,
> Lang
> 
>> Thanks,
>> Lijo
>>
>>>    	adev->mmhub.funcs->get_clockgating(adev, flags);
>>>    	if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> index ca9841d5669f..ff9dff2a6cf1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> @@ -1695,6 +1695,9 @@ static void gmc_v8_0_get_clockgating_state(void *handle, u32 *flags)
>>>    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>    	int data;
>>> +	if (!amdgpu_gmc_cg_enabled(adev))
>>> +		return;
>>> +
>>>    	if (amdgpu_sriov_vf(adev))
>>>    		*flags = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 4595027a8c63..faf017609dfe 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -1952,6 +1952,9 @@ static void gmc_v9_0_get_clockgating_state(void *handle, u32 *flags)
>>>    {
>>>    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +	if (!amdgpu_gmc_cg_enabled(adev))
>>> +		return;
>>> +
>>>    	adev->mmhub.funcs->get_clockgating(adev, flags);
>>>    	athub_v1_0_get_clockgating(adev, flags);
>>>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add safeguards for querying GMC CG state
  2022-01-28  9:06     ` Lazar, Lijo
@ 2022-01-28  9:16       ` Lang Yu
  2022-01-28  9:33         ` Lazar, Lijo
  0 siblings, 1 reply; 6+ messages in thread
From: Lang Yu @ 2022-01-28  9:16 UTC (permalink / raw)
  To: Lazar, Lijo; +Cc: Alex Deucher, Huang Rui, Christian Koenig, amd-gfx

On 01/28/ , Lazar, Lijo wrote:
> 
> 
> On 1/28/2022 2:22 PM, Lang Yu wrote:
> > On 01/28/ , Lazar, Lijo wrote:
> > > 
> > > 
> > > On 1/28/2022 12:24 PM, Lang Yu wrote:
> > > > We observed a GPU hang when querying GMC CG state(i.e.,
> > > > cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
> > > > skillfish doesn't support any CG features.
> > > > 
> > > > Only allow ASICs which support GMC CG features accessing
> > > > related registers. As some ASICs support GMC CG but cg_flags
> > > > are not set. Use GC IP version instead of cg_flags to
> > > > determine whether GMC CG is supported or not.
> > > > 
> > > > v2:
> > > >    - Use a function to encapsulate more functionality.(Christian)
> > > >    - Use IP verion to determine whether CG is supported or not.(Lijo)
> > > > 
> > > > Signed-off-by: Lang Yu <Lang.Yu@amd.com>
> > > > ---
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 10 ++++++++++
> > > >    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
> > > >    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  3 +++
> > > >    drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  3 +++
> > > >    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  3 +++
> > > >    5 files changed, 20 insertions(+)
> > > > 
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > > > index d426de48d299..be1f03b02af6 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > > > @@ -876,3 +876,13 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
> > > >    	return 0;
> > > >    }
> > > > +
> > > > +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev)
> > > > +{
> > > > +	switch (adev->ip_versions[GC_HWIP][0]) {
> > > > +	case IP_VERSION(10, 1, 3):
> > > > +		return false;
> > > > +	default:
> > > > +		return true;
> > > > +	}
> > > > +}
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > > > index 93505bb0a36c..b916e73c7de1 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > > > @@ -338,4 +338,5 @@ uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
> > > >    uint64_t amdgpu_gmc_vram_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
> > > >    uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
> > > >    int amdgpu_gmc_vram_checking(struct amdgpu_device *adev);
> > > > +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev);
> > > >    #endif
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > > > index 73ab0eebe4e2..4e46f618d6c1 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > > > @@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
> > > >    {
> > > >    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > > > +	if (!amdgpu_gmc_cg_enabled(adev))
> > > > +		return;
> > > > +
> > > 
> > > I think Christian suggested amdgpu_gmc_cg_enabled function assuming it's a
> > > common logic for all ASICs based on flags. Now that assumption has changed.
> > > Now the logic is a specific IP version doesn't enable CG which is known
> > > beforehand. So we could maintain the check in the specific IP version block
> > > itself (gmc 10 in this example). No need to call another common function
> > > which checks IP version again.
> > 
> > Thanks. You mean just like this?
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index 73ab0eebe4e2..bddaf2417344 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
> >   {
> >          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > 
> > +       if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 3))
> 		*flags = 0;
> 
> Yes, add the above line also.

It may clear CG mask of other IP block. Does it make sense? Thanks!

Regards,
Lang

> Thanks,
> Lijo
> > +               return;
> > +
> >          adev->mmhub.funcs->get_clockgating(adev, flags);
> > 
> >          if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
> > 
> > Regards,
> > Lang
> > 
> > > Thanks,
> > > Lijo
> > > 
> > > >    	adev->mmhub.funcs->get_clockgating(adev, flags);
> > > >    	if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> > > > index ca9841d5669f..ff9dff2a6cf1 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> > > > @@ -1695,6 +1695,9 @@ static void gmc_v8_0_get_clockgating_state(void *handle, u32 *flags)
> > > >    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > > >    	int data;
> > > > +	if (!amdgpu_gmc_cg_enabled(adev))
> > > > +		return;
> > > > +
> > > >    	if (amdgpu_sriov_vf(adev))
> > > >    		*flags = 0;
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > > > index 4595027a8c63..faf017609dfe 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > > > @@ -1952,6 +1952,9 @@ static void gmc_v9_0_get_clockgating_state(void *handle, u32 *flags)
> > > >    {
> > > >    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > > > +	if (!amdgpu_gmc_cg_enabled(adev))
> > > > +		return;
> > > > +
> > > >    	adev->mmhub.funcs->get_clockgating(adev, flags);
> > > >    	athub_v1_0_get_clockgating(adev, flags);
> > > > 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: add safeguards for querying GMC CG state
  2022-01-28  9:16       ` Lang Yu
@ 2022-01-28  9:33         ` Lazar, Lijo
  0 siblings, 0 replies; 6+ messages in thread
From: Lazar, Lijo @ 2022-01-28  9:33 UTC (permalink / raw)
  To: Lang Yu; +Cc: Alex Deucher, Huang Rui, Christian Koenig, amd-gfx



On 1/28/2022 2:46 PM, Lang Yu wrote:
> On 01/28/ , Lazar, Lijo wrote:
>>
>>
>> On 1/28/2022 2:22 PM, Lang Yu wrote:
>>> On 01/28/ , Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 1/28/2022 12:24 PM, Lang Yu wrote:
>>>>> We observed a GPU hang when querying GMC CG state(i.e.,
>>>>> cat amdgpu_pm_info) on cyan skillfish. Acctually, cyan
>>>>> skillfish doesn't support any CG features.
>>>>>
>>>>> Only allow ASICs which support GMC CG features accessing
>>>>> related registers. As some ASICs support GMC CG but cg_flags
>>>>> are not set. Use GC IP version instead of cg_flags to
>>>>> determine whether GMC CG is supported or not.
>>>>>
>>>>> v2:
>>>>>     - Use a function to encapsulate more functionality.(Christian)
>>>>>     - Use IP verion to determine whether CG is supported or not.(Lijo)
>>>>>
>>>>> Signed-off-by: Lang Yu <Lang.Yu@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 10 ++++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  3 +++
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  3 +++
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  3 +++
>>>>>     5 files changed, 20 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>>> index d426de48d299..be1f03b02af6 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>>>>> @@ -876,3 +876,13 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
>>>>>     	return 0;
>>>>>     }
>>>>> +
>>>>> +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev)
>>>>> +{
>>>>> +	switch (adev->ip_versions[GC_HWIP][0]) {
>>>>> +	case IP_VERSION(10, 1, 3):
>>>>> +		return false;
>>>>> +	default:
>>>>> +		return true;
>>>>> +	}
>>>>> +}
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>>> index 93505bb0a36c..b916e73c7de1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>>>>> @@ -338,4 +338,5 @@ uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
>>>>>     uint64_t amdgpu_gmc_vram_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>>>>>     uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>>>>>     int amdgpu_gmc_vram_checking(struct amdgpu_device *adev);
>>>>> +bool amdgpu_gmc_cg_enabled(struct amdgpu_device *adev);
>>>>>     #endif
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> index 73ab0eebe4e2..4e46f618d6c1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> @@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
>>>>>     {
>>>>>     	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>> +	if (!amdgpu_gmc_cg_enabled(adev))
>>>>> +		return;
>>>>> +
>>>>
>>>> I think Christian suggested amdgpu_gmc_cg_enabled function assuming it's a
>>>> common logic for all ASICs based on flags. Now that assumption has changed.
>>>> Now the logic is a specific IP version doesn't enable CG which is known
>>>> beforehand. So we could maintain the check in the specific IP version block
>>>> itself (gmc 10 in this example). No need to call another common function
>>>> which checks IP version again.
>>>
>>> Thanks. You mean just like this?
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 73ab0eebe4e2..bddaf2417344 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -1156,6 +1156,9 @@ static void gmc_v10_0_get_clockgating_state(void *handle, u32 *flags)
>>>    {
>>>           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>
>>> +       if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 3))
>> 		*flags = 0;
>>
>> Yes, add the above line also.
> 
> It may clear CG mask of other IP block. Does it make sense? Thanks!
> 
Ah! right. No need to clear the flags.

Thanks,
Lijo

> Regards,
> Lang
> 
>> Thanks,
>> Lijo
>>> +               return;
>>> +
>>>           adev->mmhub.funcs->get_clockgating(adev, flags);
>>>
>>>           if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
>>>
>>> Regards,
>>> Lang
>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>     	adev->mmhub.funcs->get_clockgating(adev, flags);
>>>>>     	if (adev->ip_versions[ATHUB_HWIP][0] >= IP_VERSION(2, 1, 0))
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> index ca9841d5669f..ff9dff2a6cf1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> @@ -1695,6 +1695,9 @@ static void gmc_v8_0_get_clockgating_state(void *handle, u32 *flags)
>>>>>     	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>>     	int data;
>>>>> +	if (!amdgpu_gmc_cg_enabled(adev))
>>>>> +		return;
>>>>> +
>>>>>     	if (amdgpu_sriov_vf(adev))
>>>>>     		*flags = 0;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index 4595027a8c63..faf017609dfe 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -1952,6 +1952,9 @@ static void gmc_v9_0_get_clockgating_state(void *handle, u32 *flags)
>>>>>     {
>>>>>     	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>> +	if (!amdgpu_gmc_cg_enabled(adev))
>>>>> +		return;
>>>>> +
>>>>>     	adev->mmhub.funcs->get_clockgating(adev, flags);
>>>>>     	athub_v1_0_get_clockgating(adev, flags);
>>>>>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-01-28  9:33 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-28  6:54 [PATCH v2] drm/amdgpu: add safeguards for querying GMC CG state Lang Yu
2022-01-28  8:31 ` Lazar, Lijo
2022-01-28  8:52   ` Lang Yu
2022-01-28  9:06     ` Lazar, Lijo
2022-01-28  9:16       ` Lang Yu
2022-01-28  9:33         ` Lazar, Lijo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.