All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init
@ 2022-01-20  3:18 yipechai
  2022-01-20  3:18 ` [PATCH V2 2/2] Revert "drm/amdgpu: No longer insert ras blocks into ras_list if it already exists in ras_list" yipechai
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: yipechai @ 2022-01-20  3:18 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, John.Clements, yipechai, yipechai

Move xgmi ras initialization from .late_init to .early_init, which let
xgmi ras can be initialized only once.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +++++
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 3483a82f5734..788c0257832d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 	} while (fault->timestamp < tmp);
 }
 
+int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev)
+{
+	if (!adev->gmc.xgmi.connected_to_cpu) {
+		adev->gmc.xgmi.ras = &xgmi_ras;
+		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
+	}
+
+	return 0;
+}
+
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 {
 	int r;
@@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (!adev->gmc.xgmi.connected_to_cpu) {
-		adev->gmc.xgmi.ras = &xgmi_ras;
-		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
-	}
-
 	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
 		r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
 		if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 0001631cfedb..ac4c0e50b45c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
 			      uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 				     uint16_t pasid);
+int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 4f8d356f8432..7a6ad5d467b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct amdgpu_device *adev)
 
 static int gmc_v10_0_early_init(void *handle)
 {
+	int r;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	gmc_v10_0_set_mmhub_funcs(adev);
@@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
 	adev->gmc.private_aperture_end =
 		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
 
+	r = amdgpu_gmc_ras_early_init(adev);
+	if (r)
+		return r;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index c76ffd1a70cd..3cdd3d459d51 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1318,6 +1318,7 @@ static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
 
 static int gmc_v9_0_early_init(void *handle)
 {
+	int r;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	/* ARCT and VEGA20 don't have XGMI defined in their IP discovery tables */
@@ -1347,6 +1348,10 @@ static int gmc_v9_0_early_init(void *handle)
 	adev->gmc.private_aperture_end =
 		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
 
+	r = amdgpu_gmc_ras_early_init(adev);
+	if (r)
+		return r;
+
 	return 0;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH V2 2/2] Revert "drm/amdgpu: No longer insert ras blocks into ras_list if it already exists in ras_list"
  2022-01-20  3:18 [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init yipechai
@ 2022-01-20  3:18 ` yipechai
  2022-01-20  3:39 ` [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init Zhou1, Tao
  2022-01-20  5:49 ` Lazar, Lijo
  2 siblings, 0 replies; 7+ messages in thread
From: yipechai @ 2022-01-20  3:18 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, John.Clements, yipechai, yipechai

This reverts commit 48e175f7476c6deb7ccf1f10d081322d52830a17.

Xgmi ras initialization had been moved from .late_init to early_init,
the defect of repeated calling amdgpu_ras_register_ras_block had been
fixed, so revert this patch.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7a1d2bac698e..c92383fe7834 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2765,19 +2765,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 		struct amdgpu_ras_block_object *ras_block_obj)
 {
-	struct amdgpu_ras_block_object *obj, *tmp;
 	if (!adev || !ras_block_obj)
 		return -EINVAL;
 
 	if (!amdgpu_ras_asic_supported(adev))
 		return 0;
 
-	/* If the ras object is in ras_list, don't add it again */
-	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
-		if (obj == ras_block_obj)
-			return 0;
-	}
-
 	INIT_LIST_HEAD(&ras_block_obj->node);
 	list_add_tail(&ras_block_obj->node, &adev->ras_list);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init
  2022-01-20  3:18 [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init yipechai
  2022-01-20  3:18 ` [PATCH V2 2/2] Revert "drm/amdgpu: No longer insert ras blocks into ras_list if it already exists in ras_list" yipechai
@ 2022-01-20  3:39 ` Zhou1, Tao
  2022-01-20  5:49 ` Lazar, Lijo
  2 siblings, 0 replies; 7+ messages in thread
From: Zhou1, Tao @ 2022-01-20  3:39 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx; +Cc: Clements, John, Zhang, Hawking

[AMD Official Use Only]

The series is:

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Thursday, January 20, 2022 11:19 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Clements,
> John <John.Clements@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization
> from .late_init to .early_init
> 
> Move xgmi ras initialization from .late_init to .early_init, which let xgmi ras can
> be initialized only once.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +++++
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +++++
>  4 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 3483a82f5734..788c0257832d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct
> amdgpu_device *adev, uint64_t addr,
>  	} while (fault->timestamp < tmp);
>  }
> 
> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev) {
> +	if (!adev->gmc.xgmi.connected_to_cpu) {
> +		adev->gmc.xgmi.ras = &xgmi_ras;
> +		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras-
> >ras_block);
> +	}
> +
> +	return 0;
> +}
> +
>  int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)  {
>  	int r;
> @@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device
> *adev)
>  			return r;
>  	}
> 
> -	if (!adev->gmc.xgmi.connected_to_cpu) {
> -		adev->gmc.xgmi.ras = &xgmi_ras;
> -		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras-
> >ras_block);
> -	}
> -
>  	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init)
> {
>  		r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
>  		if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 0001631cfedb..ac4c0e50b45c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device
> *adev,
>  			      uint16_t pasid, uint64_t timestamp);  void
> amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>  				     uint16_t pasid);
> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
>  int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);  void
> amdgpu_gmc_ras_fini(struct amdgpu_device *adev);  int
> amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev); diff --git
> a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 4f8d356f8432..7a6ad5d467b2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct
> amdgpu_device *adev)
> 
>  static int gmc_v10_0_early_init(void *handle)  {
> +	int r;
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
>  	gmc_v10_0_set_mmhub_funcs(adev);
> @@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
>  	adev->gmc.private_aperture_end =
>  		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
> 
> +	r = amdgpu_gmc_ras_early_init(adev);
> +	if (r)
> +		return r;
> +
>  	return 0;
>  }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index c76ffd1a70cd..3cdd3d459d51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1318,6 +1318,7 @@ static void gmc_v9_0_set_mca_funcs(struct
> amdgpu_device *adev)
> 
>  static int gmc_v9_0_early_init(void *handle)  {
> +	int r;
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
>  	/* ARCT and VEGA20 don't have XGMI defined in their IP discovery
> tables */ @@ -1347,6 +1348,10 @@ static int gmc_v9_0_early_init(void
> *handle)
>  	adev->gmc.private_aperture_end =
>  		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
> 
> +	r = amdgpu_gmc_ras_early_init(adev);
> +	if (r)
> +		return r;
> +
>  	return 0;
>  }
> 
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init
  2022-01-20  3:18 [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init yipechai
  2022-01-20  3:18 ` [PATCH V2 2/2] Revert "drm/amdgpu: No longer insert ras blocks into ras_list if it already exists in ras_list" yipechai
  2022-01-20  3:39 ` [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init Zhou1, Tao
@ 2022-01-20  5:49 ` Lazar, Lijo
  2022-01-20  7:27   ` Chai, Thomas
  2 siblings, 1 reply; 7+ messages in thread
From: Lazar, Lijo @ 2022-01-20  5:49 UTC (permalink / raw)
  To: yipechai, amd-gfx; +Cc: yipechai, Tao.Zhou1, John.Clements, Hawking.Zhang



On 1/20/2022 8:48 AM, yipechai wrote:
> Move xgmi ras initialization from .late_init to .early_init, which let
> xgmi ras can be initialized only once.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +++++
>   4 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 3483a82f5734..788c0257832d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>   	} while (fault->timestamp < tmp);
>   }
>   
> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev)
> +{
> +	if (!adev->gmc.xgmi.connected_to_cpu) {
> +		adev->gmc.xgmi.ras = &xgmi_ras;
> +		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
> +	}
> +
> +	return 0;
> +}
> +
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   {
>   	int r;
> @@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   			return r;
>   	}
>   
> -	if (!adev->gmc.xgmi.connected_to_cpu) {
> -		adev->gmc.xgmi.ras = &xgmi_ras;
> -		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
> -	}
> -
>   	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
>   		r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
>   		if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 0001631cfedb..ac4c0e50b45c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
>   			      uint16_t pasid, uint64_t timestamp);
>   void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>   				     uint16_t pasid);
> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 4f8d356f8432..7a6ad5d467b2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct amdgpu_device *adev)
>   
>   static int gmc_v10_0_early_init(void *handle)
>   {
> +	int r;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	gmc_v10_0_set_mmhub_funcs(adev);
> @@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
>   	adev->gmc.private_aperture_end =
>   		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>   
> +	r = amdgpu_gmc_ras_early_init(adev);
> +	if (r)
> +		return r;
> +

At this point it's unknown if RAS is applicable for the SKU. I think 
this failure check shouldn't be there (here and below one).

amdgpu_gmc_ras_early_init is return 0 always, that way also this check 
is not needed.

Thanks,
Lijo

>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index c76ffd1a70cd..3cdd3d459d51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1318,6 +1318,7 @@ static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
>   
>   static int gmc_v9_0_early_init(void *handle)
>   {
> +	int r;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	/* ARCT and VEGA20 don't have XGMI defined in their IP discovery tables */
> @@ -1347,6 +1348,10 @@ static int gmc_v9_0_early_init(void *handle)
>   	adev->gmc.private_aperture_end =
>   		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>   
> +	r = amdgpu_gmc_ras_early_init(adev);
> +	if (r)
> +		return r;
> +
>   	return 0;
>   }
>   
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init
  2022-01-20  5:49 ` Lazar, Lijo
@ 2022-01-20  7:27   ` Chai, Thomas
  2022-01-20  7:31     ` Lazar, Lijo
  0 siblings, 1 reply; 7+ messages in thread
From: Chai, Thomas @ 2022-01-20  7:27 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Zhou1, Tao, Clements, John, Zhang, Hawking


-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com> 
Sent: Thursday, January 20, 2022 1:49 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init



On 1/20/2022 8:48 AM, yipechai wrote:
> Move xgmi ras initialization from .late_init to .early_init, which let 
> xgmi ras can be initialized only once.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +++++
>   4 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 3483a82f5734..788c0257832d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>   	} while (fault->timestamp < tmp);
>   }
>   
> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev) {
> +	if (!adev->gmc.xgmi.connected_to_cpu) {
> +		adev->gmc.xgmi.ras = &xgmi_ras;
> +		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
> +	}
> +
> +	return 0;
> +}
> +
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   {
>   	int r;
> @@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   			return r;
>   	}
>   
> -	if (!adev->gmc.xgmi.connected_to_cpu) {
> -		adev->gmc.xgmi.ras = &xgmi_ras;
> -		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
> -	}
> -
>   	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
>   		r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
>   		if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 0001631cfedb..ac4c0e50b45c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
>   			      uint16_t pasid, uint64_t timestamp);
>   void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>   				     uint16_t pasid);
> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev); diff 
> --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 4f8d356f8432..7a6ad5d467b2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct 
> amdgpu_device *adev)
>   
>   static int gmc_v10_0_early_init(void *handle)
>   {
> +	int r;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	gmc_v10_0_set_mmhub_funcs(adev);
> @@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
>   	adev->gmc.private_aperture_end =
>   		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>   
> +	r = amdgpu_gmc_ras_early_init(adev);
> +	if (r)
> +		return r;
> +

>At this point it's unknown if RAS is applicable for the SKU. I think this failure check shouldn't be there (here and below one).

>amdgpu_gmc_ras_early_init is return 0 always, that way also this check is not needed.

[Thomas]  Just like calling amdgpu_gmc_ras_late_init,  checking the return status may make the code extensible.  
	   In amdgpu_gmc_ras_early_init,  the xgmi ras initialization may always return 0, but it may add functions that need to check the return status in future.

Thanks,
Lijo

>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index c76ffd1a70cd..3cdd3d459d51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1318,6 +1318,7 @@ static void gmc_v9_0_set_mca_funcs(struct 
> amdgpu_device *adev)
>   
>   static int gmc_v9_0_early_init(void *handle)
>   {
> +	int r;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	/* ARCT and VEGA20 don't have XGMI defined in their IP discovery 
> tables */ @@ -1347,6 +1348,10 @@ static int gmc_v9_0_early_init(void *handle)
>   	adev->gmc.private_aperture_end =
>   		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>   
> +	r = amdgpu_gmc_ras_early_init(adev);
> +	if (r)
> +		return r;
> +
>   	return 0;
>   }
>   
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init
  2022-01-20  7:27   ` Chai, Thomas
@ 2022-01-20  7:31     ` Lazar, Lijo
  2022-01-20  8:57       ` Chai, Thomas
  0 siblings, 1 reply; 7+ messages in thread
From: Lazar, Lijo @ 2022-01-20  7:31 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx; +Cc: Zhou1, Tao, Clements, John, Zhang, Hawking



On 1/20/2022 12:57 PM, Chai, Thomas wrote:
> 
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> Sent: Thursday, January 20, 2022 1:49 PM
> To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init
> 
> 
> 
> On 1/20/2022 8:48 AM, yipechai wrote:
>> Move xgmi ras initialization from .late_init to .early_init, which let
>> xgmi ras can be initialized only once.
>>
>> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++++++++++-----
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +++++
>>    4 files changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> index 3483a82f5734..788c0257832d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> @@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>>    	} while (fault->timestamp < tmp);
>>    }
>>    
>> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev) {
>> +	if (!adev->gmc.xgmi.connected_to_cpu) {
>> +		adev->gmc.xgmi.ras = &xgmi_ras;
>> +		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>    int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>    {
>>    	int r;
>> @@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>    			return r;
>>    	}
>>    
>> -	if (!adev->gmc.xgmi.connected_to_cpu) {
>> -		adev->gmc.xgmi.ras = &xgmi_ras;
>> -		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
>> -	}
>> -
>>    	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
>>    		r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
>>    		if (r)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index 0001631cfedb..ac4c0e50b45c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
>>    			      uint16_t pasid, uint64_t timestamp);
>>    void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>>    				     uint16_t pasid);
>> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
>>    int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>    void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>    int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev); diff
>> --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 4f8d356f8432..7a6ad5d467b2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct
>> amdgpu_device *adev)
>>    
>>    static int gmc_v10_0_early_init(void *handle)
>>    {
>> +	int r;
>>    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>    
>>    	gmc_v10_0_set_mmhub_funcs(adev);
>> @@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
>>    	adev->gmc.private_aperture_end =
>>    		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>>    
>> +	r = amdgpu_gmc_ras_early_init(adev);
>> +	if (r)
>> +		return r;
>> +
> 
>> At this point it's unknown if RAS is applicable for the SKU. I think this failure check shouldn't be there (here and below one).
> 
>> amdgpu_gmc_ras_early_init is return 0 always, that way also this check is not needed.
> 
> [Thomas]  Just like calling amdgpu_gmc_ras_late_init,  checking the return status may make the code extensible.
> 	   In amdgpu_gmc_ras_early_init,  the xgmi ras initialization may always return 0, but it may add functions that need to check the return status in future.
> 

At this point, it's unknown

1) If the device is part of XGMI hive or not.
2) If the device supports RAS.

For such devices, it doesn't make any sense to fail here based on this 
function.

> Thanks,
> Lijo
> 
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index c76ffd1a70cd..3cdd3d459d51 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -1318,6 +1318,7 @@ static void gmc_v9_0_set_mca_funcs(struct
>> amdgpu_device *adev)
>>    
>>    static int gmc_v9_0_early_init(void *handle)
>>    {
>> +	int r;
>>    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>    
>>    	/* ARCT and VEGA20 don't have XGMI defined in their IP discovery
>> tables */ @@ -1347,6 +1348,10 @@ static int gmc_v9_0_early_init(void *handle)
>>    	adev->gmc.private_aperture_end =
>>    		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>>    
>> +	r = amdgpu_gmc_ras_early_init(adev);
>> +	if (r)
>> +		return r;
>> +
>>    	return 0;
>>    }
>>    
>>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init
  2022-01-20  7:31     ` Lazar, Lijo
@ 2022-01-20  8:57       ` Chai, Thomas
  0 siblings, 0 replies; 7+ messages in thread
From: Chai, Thomas @ 2022-01-20  8:57 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Zhou1, Tao, Clements, John, Zhang, Hawking



-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com> 
Sent: Thursday, January 20, 2022 3:32 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>
Subject: Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init



On 1/20/2022 12:57 PM, Chai, Thomas wrote:
> 
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> Sent: Thursday, January 20, 2022 1:49 PM
> To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Zhang, Hawking 
> <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>; Chai, 
> Thomas <YiPeng.Chai@amd.com>
> Subject: Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization 
> from .late_init to .early_init
> 
> 
> 
> On 1/20/2022 8:48 AM, yipechai wrote:
>> Move xgmi ras initialization from .late_init to .early_init, which 
>> let xgmi ras can be initialized only once.
>>
>> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++++++++++-----
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +++++
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +++++
>>    4 files changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> index 3483a82f5734..788c0257832d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> @@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>>    	} while (fault->timestamp < tmp);
>>    }
>>    
>> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev) {
>> +	if (!adev->gmc.xgmi.connected_to_cpu) {
>> +		adev->gmc.xgmi.ras = &xgmi_ras;
>> +		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>    int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>    {
>>    	int r;
>> @@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>    			return r;
>>    	}
>>    
>> -	if (!adev->gmc.xgmi.connected_to_cpu) {
>> -		adev->gmc.xgmi.ras = &xgmi_ras;
>> -		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
>> -	}
>> -
>>    	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
>>    		r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
>>    		if (r)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index 0001631cfedb..ac4c0e50b45c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
>>    			      uint16_t pasid, uint64_t timestamp);
>>    void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
>>    				     uint16_t pasid);
>> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
>>    int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>    void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>    int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 4f8d356f8432..7a6ad5d467b2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct 
>> amdgpu_device *adev)
>>    
>>    static int gmc_v10_0_early_init(void *handle)
>>    {
>> +	int r;
>>    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>    
>>    	gmc_v10_0_set_mmhub_funcs(adev);
>> @@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
>>    	adev->gmc.private_aperture_end =
>>    		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>>    
>> +	r = amdgpu_gmc_ras_early_init(adev);
>> +	if (r)
>> +		return r;
>> +
> 
>> At this point it's unknown if RAS is applicable for the SKU. I think this failure check shouldn't be there (here and below one).
> 
>> amdgpu_gmc_ras_early_init is return 0 always, that way also this check is not needed.
> 
> [Thomas]  Just like calling amdgpu_gmc_ras_late_init,  checking the return status may make the code extensible.
> 	   In amdgpu_gmc_ras_early_init,  the xgmi ras initialization may always return 0, but it may add functions that need to check the return status in future.
> 

>At this point, it's unknown

>1) If the device is part of XGMI hive or not.
>2) If the device supports RAS.

>For such devices, it doesn't make any sense to fail here based on this function.

[Thomas] The current code in amdgpu_gmc_ras_early_init has no effect for the devices,  whether the device has xgmi hive or not, whether it support RAS or not.
                  Checking the return status is just for amdgpu_gmc_ras_early_init to make it easy to add future functions that need to check the return status.

> Thanks,
> Lijo
> 
>>    	return 0;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index c76ffd1a70cd..3cdd3d459d51 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -1318,6 +1318,7 @@ static void gmc_v9_0_set_mca_funcs(struct 
>> amdgpu_device *adev)
>>    
>>    static int gmc_v9_0_early_init(void *handle)
>>    {
>> +	int r;
>>    	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>    
>>    	/* ARCT and VEGA20 don't have XGMI defined in their IP discovery 
>> tables */ @@ -1347,6 +1348,10 @@ static int gmc_v9_0_early_init(void *handle)
>>    	adev->gmc.private_aperture_end =
>>    		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>>    
>> +	r = amdgpu_gmc_ras_early_init(adev);
>> +	if (r)
>> +		return r;
>> +
>>    	return 0;
>>    }
>>    
>>

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-01-20  8:57 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-20  3:18 [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init yipechai
2022-01-20  3:18 ` [PATCH V2 2/2] Revert "drm/amdgpu: No longer insert ras blocks into ras_list if it already exists in ras_list" yipechai
2022-01-20  3:39 ` [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init Zhou1, Tao
2022-01-20  5:49 ` Lazar, Lijo
2022-01-20  7:27   ` Chai, Thomas
2022-01-20  7:31     ` Lazar, Lijo
2022-01-20  8:57       ` Chai, Thomas

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.