All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block
@ 2021-12-01 10:52 yipechai
  2021-12-01 10:52 ` [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h yipechai
                   ` (10 more replies)
  0 siblings, 11 replies; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1. Define a unified ops interface for each ras block.
2. Add a ras_block_match function pointer to the ops interface so that each ras block can identify itself.
3. Define unified basic ras block data for each ras block.
4. Create a dedicated amdgpu device ras block linked list to manage all of the ras blocks.
5. Add a new function, amdgpu_ras_register_ras_block, that each ras block calls to register itself with the ras controlling block.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 12 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 29 ++++++++++++++++++++++
 4 files changed, 45 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index db1505455761..eddf230856e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1151,6 +1151,8 @@ struct amdgpu_device {
 	bool				barrier_has_auto_waitcnt;
 
 	struct amdgpu_reset_control     *reset_cntl;
+
+	struct list_head		ras_list;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 73ec46140d68..0980396ee709 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3578,6 +3578,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
 	INIT_LIST_HEAD(&adev->reset_list);
 
+	INIT_LIST_HEAD(&adev->ras_list);
+
 	INIT_DELAYED_WORK(&adev->delayed_init_work,
 			  amdgpu_device_delayed_init_work_handler);
 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..8713575c7cf1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2739,3 +2739,15 @@ static void amdgpu_register_bad_pages_mca_notifier(void)
         }
 }
 #endif
+/* Rigister each ip ras block into amdgpu ras */
+int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
+		struct amdgpu_ras_block_object* ras_block_obj)
+{
+	if (!adev || !ras_block_obj)
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&ras_block_obj->node);
+	list_add_tail(&ras_block_obj->node, &adev->ras_list);
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cdd0010a5389..d6e5e3c862bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -469,6 +469,34 @@ struct ras_debug_if {
 	};
 	int op;
 };
+
+struct amdgpu_ras_block_object {
+	/* block name */
+	char name[32];
+
+	enum amdgpu_ras_block block;
+
+	uint32_t sub_block_index;
+
+	/* ras block link */
+	struct list_head node;
+
+	const struct amdgpu_ras_block_ops *ops;
+};
+
+struct amdgpu_ras_block_ops {
+	int (*ras_block_match)(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index);
+	int (*ras_late_init)(struct amdgpu_device *adev);
+	void (*ras_fini)(struct amdgpu_device *adev);
+	int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
+	void  (*query_ras_error_count)(struct amdgpu_device *adev,void *ras_error_status);
+	void (*query_ras_error_status)(struct amdgpu_device *adev);
+	bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
+	void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
+	void (*reset_ras_error_count)(struct amdgpu_device *adev);
+	void (*reset_ras_error_status)(struct amdgpu_device *adev);
+};
+
 /* work flow
  * vbios
  * 1: ras feature enable (enabled by default)
@@ -652,4 +680,5 @@ const char *get_ras_block_str(struct ras_common_if *ras_block);
 
 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
 
+int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object* ras_block_obj);
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-06  6:56   ` Zhou1, Tao
  2021-12-01 10:52 ` [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops yipechai
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

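Fix the compilation failure that occurs when other ras blocks' .h files include amdgpu_ras.h, by moving the static inline helpers amdgpu_ras_is_supported and amdgpu_ras_reset_gpu out of amdgpu_ras.h and into amdgpu_ras.c.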

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 23 ++++-------------------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8713575c7cf1..1cf1f6331db1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2739,6 +2739,28 @@ static void amdgpu_register_bad_pages_mca_notifier(void)
         }
 }
 #endif
+
+/* check if ras is supported on block, say, sdma, gfx */
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+		unsigned int block)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (block >= AMDGPU_RAS_BLOCK_COUNT)
+		return 0;
+	return ras && (adev->ras_enabled & (1 << block));
+}
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+		schedule_work(&ras->recovery_work);
+	return 0;
+}
+
+
 /* Rigister each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 		struct amdgpu_ras_block_object* ras_block_obj)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index d6e5e3c862bd..41623a649fa1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -514,16 +514,6 @@ struct amdgpu_ras_block_ops {
 #define amdgpu_ras_get_context(adev)		((adev)->psp.ras_context.ras)
 #define amdgpu_ras_set_context(adev, ras_con)	((adev)->psp.ras_context.ras = (ras_con))
 
-/* check if ras is supported on block, say, sdma, gfx */
-static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
-		unsigned int block)
-{
-	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-	if (block >= AMDGPU_RAS_BLOCK_COUNT)
-		return 0;
-	return ras && (adev->ras_enabled & (1 << block));
-}
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 
@@ -540,15 +530,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 
 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
 
-static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
-{
-	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
-		schedule_work(&ras->recovery_work);
-	return 0;
-}
-
 static inline enum ta_ras_block
 amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
 	switch (block) {
@@ -680,5 +661,9 @@ const char *get_ras_block_str(struct ras_common_if *ras_block);
 
 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
 
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,	unsigned int block);
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
+
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object* ras_block_obj);
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
  2021-12-01 10:52 ` [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-06  6:58   ` Zhou1, Tao
  2021-12-01 10:52 ` [PATCH V2 04/11] drm/amdgpu: Modify gmc " yipechai
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1. Modify the gfx block to fit the unified ras block data and ops.
2. Implement the .ras_block_match function pointer so that the gfx block can identify itself.
3. Rename amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and drop the _funcs suffix from the corresponding variable names.
4. Remove the const qualifier from the gfx ras variable so that the gfx ras block can be inserted into the amdgpu device ras block linked list.
5. Invoke amdgpu_ras_register_ras_block to register the gfx ras block in the amdgpu device ras block linked list.
6. Remove the gfx code in amdgpu_ras.c that becomes redundant once the unified ras block is used.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 15 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 80 ++++++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 73 +++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 39 ++++++++----
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 42 +++++++++----
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
 8 files changed, 178 insertions(+), 81 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 1795d448c700..da8691259ac1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
 	 */
 	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
 		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_count)
-			adev->gfx.ras_funcs->query_ras_error_count(adev, err_data);
+		if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
+		    adev->gfx.ras->ras_block.ops->query_ras_error_count)
+			adev->gfx.ras->ras_block.ops->query_ras_error_count(adev, err_data);
 		amdgpu_ras_reset_gpu(adev);
 	}
 	return AMDGPU_RAS_SUCCESS;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 6b78b4a0e182..ff4a8428a84b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -31,6 +31,7 @@
 #include "amdgpu_ring.h"
 #include "amdgpu_rlc.h"
 #include "soc15.h"
+#include "amdgpu_ras.h"
 
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
@@ -213,16 +214,8 @@ struct amdgpu_cu_info {
 	uint32_t bitmap[4][4];
 };
 
-struct amdgpu_gfx_ras_funcs {
-	int (*ras_late_init)(struct amdgpu_device *adev);
-	void (*ras_fini)(struct amdgpu_device *adev);
-	int (*ras_error_inject)(struct amdgpu_device *adev,
-				void *inject_if);
-	int (*query_ras_error_count)(struct amdgpu_device *adev,
-				     void *ras_error_status);
-	void (*reset_ras_error_count)(struct amdgpu_device *adev);
-	void (*query_ras_error_status)(struct amdgpu_device *adev);
-	void (*reset_ras_error_status)(struct amdgpu_device *adev);
+struct amdgpu_gfx_ras {
+	struct amdgpu_ras_block_object  ras_block;
 	void (*enable_watchdog_timer)(struct amdgpu_device *adev);
 };
 
@@ -348,7 +341,7 @@ struct amdgpu_gfx {
 
 	/*ras */
 	struct ras_common_if			*ras_if;
-	const struct amdgpu_gfx_ras_funcs	*ras_funcs;
+	struct amdgpu_gfx_ras	*ras;
 };
 
 #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1cf1f6331db1..190a4a4e9d7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -862,6 +862,27 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 }
 /* feature ctl end */
 
+static struct amdgpu_ras_block_object* amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
+					enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	struct amdgpu_ras_block_object *obj, *tmp;
+
+	if (block >= AMDGPU_RAS_BLOCK__LAST) {
+		return NULL;
+	}
+
+	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
+		if( !obj->ops || !obj->ops->ras_block_match) {
+			dev_info(adev->dev, "%s don't config ops or  ras_block_match\n", obj->name);
+			continue;
+		}
+		if (!obj->ops->ras_block_match(obj, block, sub_block_index)) {
+			return obj;
+		}
+	}
+
+	return NULL;
+}
 
 void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
 				       struct ras_common_if *ras_block,
@@ -892,6 +913,7 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 				  struct ras_query_if *info)
 {
+	struct amdgpu_ras_block_object* block_obj = NULL;
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 	struct ras_err_data err_data = {0, 0, 0, NULL};
 	int i;
@@ -899,6 +921,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 	if (!obj)
 		return -EINVAL;
 
+	block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__UMC:
 		if (adev->umc.ras_funcs &&
@@ -919,13 +943,17 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		}
 		break;
 	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_count)
-			adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
+		if (!block_obj || !block_obj->ops)	{
+			dev_info(adev->dev, "%s don't config ras function \n",
+				get_ras_block_str(&info->head));
+			return -EINVAL;
+		}
+
+		if (block_obj->ops->query_ras_error_count)
+			block_obj->ops->query_ras_error_count(adev, &err_data);
 
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_status)
-			adev->gfx.ras_funcs->query_ras_error_status(adev);
+		if (block_obj->ops->query_ras_error_status)
+			block_obj->ops->query_ras_error_status(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__MMHUB:
 		if (adev->mmhub.ras_funcs &&
@@ -1012,18 +1040,21 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 		enum amdgpu_ras_block block)
 {
+	struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
 	if (!amdgpu_ras_is_supported(adev, block))
 		return -EINVAL;
 
 	switch (block) {
 	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->reset_ras_error_count)
-			adev->gfx.ras_funcs->reset_ras_error_count(adev);
+		if (!block_obj || !block_obj->ops)	{
+			dev_info(adev->dev, "%s don't config ras function \n", ras_block_str(block));
+			return -EINVAL;
+		}
+		if (block_obj->ops->reset_ras_error_count)
+			block_obj->ops->reset_ras_error_count(adev);
 
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->reset_ras_error_status)
-			adev->gfx.ras_funcs->reset_ras_error_status(adev);
+		if (block_obj->ops->reset_ras_error_status)
+			block_obj->ops->reset_ras_error_status(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__MMHUB:
 		if (adev->mmhub.ras_funcs &&
@@ -1088,7 +1119,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 		.address = info->address,
 		.value = info->value,
 	};
-	int ret = 0;
+	int ret = -EINVAL;
+	struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, info->head.sub_block_index);
 
 	if (!obj)
 		return -EINVAL;
@@ -1102,11 +1134,12 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->ras_error_inject)
-			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
-		else
-			ret = -EINVAL;
+		if (!block_obj || !block_obj->ops)	{
+			dev_info(adev->dev, "%s don't config ras function \n", get_ras_block_str(&info->head));
+			return -EINVAL;
+		}
+		if (block_obj->ops->ras_error_inject)
+			ret = block_obj->ops->ras_error_inject(adev, info);
 		break;
 	case AMDGPU_RAS_BLOCK__UMC:
 	case AMDGPU_RAS_BLOCK__SDMA:
@@ -1727,15 +1760,20 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
 					  struct ras_query_if *info)
 {
+	struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, info->head.sub_block_index);
 	/*
 	 * Only two block need to query read/write
 	 * RspStatus at current state
 	 */
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.ras_funcs &&
-		    adev->gfx.ras_funcs->query_ras_error_status)
-			adev->gfx.ras_funcs->query_ras_error_status(adev);
+		if (!block_obj || !block_obj->ops)	{
+			dev_info(adev->dev, "%s don't config ras function \n", get_ras_block_str(&info->head));
+			return ;
+		}
+
+		if (block_obj->ops->query_ras_error_status)
+			block_obj->ops->query_ras_error_status(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__MMHUB:
 		if (adev->mmhub.ras_funcs &&
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 08e91e7245df..2ffde223c4f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -817,7 +817,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
 static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
 static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
-static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
+static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
 					  void *ras_error_status);
 static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
 				     void *inject_if);
@@ -2118,6 +2118,18 @@ static void gfx_v9_0_select_me_pipe_q(struct amdgpu_device *adev,
 	soc15_grbm_select(adev, me, pipe, q, vm);
 }
 
+static int gfx_v9_0_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
         .get_gpu_clock_counter = &gfx_v9_0_get_gpu_clock_counter,
         .select_se_sh = &gfx_v9_0_select_se_sh,
@@ -2127,12 +2139,21 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
         .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
 };
 
-static const struct amdgpu_gfx_ras_funcs gfx_v9_0_ras_funcs = {
-	.ras_late_init = amdgpu_gfx_ras_late_init,
-	.ras_fini = amdgpu_gfx_ras_fini,
-	.ras_error_inject = &gfx_v9_0_ras_error_inject,
-	.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
-	.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
+const struct amdgpu_ras_block_ops  gfx_v9_0_ras_ops = {
+		.ras_block_match = gfx_v9_0_ras_block_match,
+		.ras_late_init = amdgpu_gfx_ras_late_init,
+		.ras_fini = amdgpu_gfx_ras_fini,
+		.ras_error_inject = &gfx_v9_0_ras_error_inject,
+		.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
+		.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
+};
+
+static struct amdgpu_gfx_ras gfx_v9_0_ras = {
+	.ras_block = {
+		.name = "gfx",
+		.block = AMDGPU_RAS_BLOCK__GFX,
+		.ops = &gfx_v9_0_ras_ops,
+	},
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -2161,7 +2182,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
 		DRM_INFO("fix gfx.config for vega12\n");
 		break;
 	case CHIP_VEGA20:
-		adev->gfx.ras_funcs = &gfx_v9_0_ras_funcs;
+		adev->gfx.ras = &gfx_v9_0_ras;
 		adev->gfx.config.max_hw_contexts = 8;
 		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
 		adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
@@ -2187,7 +2208,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
 			gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
 		break;
 	case CHIP_ARCTURUS:
-		adev->gfx.ras_funcs = &gfx_v9_4_ras_funcs;
+		adev->gfx.ras = &gfx_v9_4_ras;
 		adev->gfx.config.max_hw_contexts = 8;
 		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
 		adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
@@ -2208,7 +2229,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
 		gb_addr_config |= 0x22010042;
 		break;
 	case CHIP_ALDEBARAN:
-		adev->gfx.ras_funcs = &gfx_v9_4_2_ras_funcs;
+		adev->gfx.ras = &gfx_v9_4_2_ras;
 		adev->gfx.config.max_hw_contexts = 8;
 		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
 		adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
@@ -2227,6 +2248,14 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
 		break;
 	}
 
+	if (adev->gfx.ras) {
+		err = amdgpu_ras_register_ras_block(adev, &adev->gfx.ras->ras_block);
+		if (err) {
+			DRM_ERROR("Failed to register gfx ras block!\n");
+			return err;
+		}
+	}
+
 	adev->gfx.config.gb_addr_config = gb_addr_config;
 
 	adev->gfx.config.gb_addr_config_fields.num_pipes = 1 <<
@@ -2448,9 +2477,9 @@ static int gfx_v9_0_sw_fini(void *handle)
 	int i;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	if (adev->gfx.ras_funcs &&
-	    adev->gfx.ras_funcs->ras_fini)
-		adev->gfx.ras_funcs->ras_fini(adev);
+	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
+	    adev->gfx.ras->ras_block.ops->ras_fini)
+		adev->gfx.ras->ras_block.ops->ras_fini(adev);
 
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
@@ -4888,16 +4917,16 @@ static int gfx_v9_0_ecc_late_init(void *handle)
 	if (r)
 		return r;
 
-	if (adev->gfx.ras_funcs &&
-	    adev->gfx.ras_funcs->ras_late_init) {
-		r = adev->gfx.ras_funcs->ras_late_init(adev);
+	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
+	    adev->gfx.ras->ras_block.ops->ras_late_init) {
+		r = adev->gfx.ras->ras_block.ops->ras_late_init(adev);
 		if (r)
 			return r;
 	}
 
-	if (adev->gfx.ras_funcs &&
-	    adev->gfx.ras_funcs->enable_watchdog_timer)
-		adev->gfx.ras_funcs->enable_watchdog_timer(adev);
+	if (adev->gfx.ras &&
+	    adev->gfx.ras->enable_watchdog_timer)
+		adev->gfx.ras->enable_watchdog_timer(adev);
 
 	return 0;
 }
@@ -6841,7 +6870,7 @@ static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev)
 	WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);
 }
 
-static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
+static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
 					  void *ras_error_status)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
@@ -6850,7 +6879,7 @@ static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
 	uint32_t reg_value;
 
 	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
-		return -EINVAL;
+		return;
 
 	err_data->ue_count = 0;
 	err_data->ce_count = 0;
@@ -6879,8 +6908,6 @@ static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
 	mutex_unlock(&adev->grbm_idx_mutex);
 
 	gfx_v9_0_query_utc_edc_status(adev, err_data);
-
-	return 0;
 }
 
 static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
index b4789dfc2bb9..2d816addbd4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -863,7 +863,7 @@ static int gfx_v9_4_ras_error_count(struct amdgpu_device *adev,
 	return 0;
 }
 
-static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
+static void gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
 					  void *ras_error_status)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
@@ -872,7 +872,7 @@ static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
 	uint32_t reg_value;
 
 	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
-		return -EINVAL;
+		return;
 
 	err_data->ue_count = 0;
 	err_data->ce_count = 0;
@@ -903,7 +903,6 @@ static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
 
 	gfx_v9_4_query_utc_edc_status(adev, err_data);
 
-	return 0;
 }
 
 static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
@@ -1029,11 +1028,31 @@ static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
 	mutex_unlock(&adev->grbm_idx_mutex);
 }
 
-const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs = {
-        .ras_late_init = amdgpu_gfx_ras_late_init,
-        .ras_fini = amdgpu_gfx_ras_fini,
-        .ras_error_inject = &gfx_v9_4_ras_error_inject,
-        .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
-        .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
-        .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
+static int gfx_v9_4_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops  gfx_v9_4_ras_ops = {
+	.ras_block_match = gfx_v9_4_ras_block_match,
+	.ras_late_init = amdgpu_gfx_ras_late_init,
+	.ras_fini = amdgpu_gfx_ras_fini,
+	.ras_error_inject = &gfx_v9_4_ras_error_inject,
+	.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
+	.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
+	.query_ras_error_status = &gfx_v9_4_query_ras_error_status,
+};
+
+struct amdgpu_gfx_ras gfx_v9_4_ras = {
+	.ras_block = {
+		.name = "gfx",
+		.block = AMDGPU_RAS_BLOCK__GFX,
+		.ops = &gfx_v9_4_ras_ops,
+	},
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
index bdd16b568021..ca520a767267 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
@@ -24,6 +24,6 @@
 #ifndef __GFX_V9_4_H__
 #define __GFX_V9_4_H__
 
-extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs;
+extern struct amdgpu_gfx_ras gfx_v9_4_ras;
 
 #endif /* __GFX_V9_4_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 54306fd45ff1..2744709fa09d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1644,14 +1644,14 @@ static int gfx_v9_4_2_query_utc_edc_count(struct amdgpu_device *adev,
 	return 0;
 }
 
-static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
+static void gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
 					    void *ras_error_status)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 	uint32_t sec_count = 0, ded_count = 0;
 
 	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
-		return -EINVAL;
+		return;
 
 	err_data->ue_count = 0;
 	err_data->ce_count = 0;
@@ -1664,7 +1664,6 @@ static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
 	err_data->ce_count += sec_count;
 	err_data->ue_count += ded_count;
 
-	return 0;
 }
 
 static void gfx_v9_4_2_reset_utc_err_status(struct amdgpu_device *adev)
@@ -1934,13 +1933,34 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
 	mutex_unlock(&adev->grbm_idx_mutex);
 }
 
-const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs = {
-	.ras_late_init = amdgpu_gfx_ras_late_init,
-	.ras_fini = amdgpu_gfx_ras_fini,
-	.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
-	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
-	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
-	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
-	.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
+static int gfx_v9_4_2_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+struct amdgpu_ras_block_ops  gfx_v9_4_2_ras_ops ={
+		.ras_block_match = gfx_v9_4_2_ras_block_match,
+		.ras_late_init = amdgpu_gfx_ras_late_init,
+		.ras_fini = amdgpu_gfx_ras_fini,
+		.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
+		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
+		.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
+		.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
+		.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
+};
+
+struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
+	.ras_block = {
+		.name = "gfx",
+		.block = AMDGPU_RAS_BLOCK__GFX,
+		.ops = &gfx_v9_4_2_ras_ops,
+	},
 	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
index 6db1f88509af..7584624b641c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
@@ -31,6 +31,6 @@ void gfx_v9_4_2_init_golden_registers(struct amdgpu_device *adev,
 void gfx_v9_4_2_set_power_brake_sequence(struct amdgpu_device *adev);
 int gfx_v9_4_2_do_edc_gpr_workarounds(struct amdgpu_device *adev);
 
-extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs;
+extern struct amdgpu_gfx_ras gfx_v9_4_2_ras;
 
 #endif /* __GFX_V9_4_2_H__ */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 04/11] drm/amdgpu: Modify gmc block to fit for the unified ras block data and ops
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
  2021-12-01 10:52 ` [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h yipechai
  2021-12-01 10:52 ` [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-01 10:52 ` [PATCH V2 05/11] drm/amdgpu: Modify hdp " yipechai
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1. Modify the gmc block to fit the unified ras block data and ops.
2. Implement the .ras_block_match function pointer so that the gmc block can identify itself.
3. Rename amdgpu_xgmi_ras_funcs to amdgpu_xgmi_ras, and drop the _funcs suffix from the corresponding variable names.
4. Remove the const qualifier from the gmc ras variable so that the gmc ras block can be inserted into the amdgpu device ras block linked list.
5. Invoke amdgpu_ras_register_ras_block to register the gmc ras block in the amdgpu device ras block linked list.
6. Remove the gmc code in amdgpu_ras.c that becomes redundant once the unified ras block is used.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 18 ++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  | 11 +++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 10 +++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 31 +++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  4 +--
 5 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 83f26bca7dac..3ba2f0f1f1b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -448,12 +448,14 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (!adev->gmc.xgmi.connected_to_cpu)
-		adev->gmc.xgmi.ras_funcs = &xgmi_ras_funcs;
+	if (!adev->gmc.xgmi.connected_to_cpu) {
+		adev->gmc.xgmi.ras = &xgmi_ras;
+		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
+	}
 
-	if (adev->gmc.xgmi.ras_funcs &&
-	    adev->gmc.xgmi.ras_funcs->ras_late_init) {
-		r = adev->gmc.xgmi.ras_funcs->ras_late_init(adev);
+	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ops &&
+	    adev->gmc.xgmi.ras->ras_block.ops->ras_late_init) {
+		r = adev->gmc.xgmi.ras->ras_block.ops->ras_late_init(adev);
 		if (r)
 			return r;
 	}
@@ -499,9 +501,9 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
 	    adev->mmhub.ras_funcs->ras_fini)
 		adev->mmhub.ras_funcs->ras_fini(adev);
 
-	if (adev->gmc.xgmi.ras_funcs &&
-	    adev->gmc.xgmi.ras_funcs->ras_fini)
-		adev->gmc.xgmi.ras_funcs->ras_fini(adev);
+	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ops &&
+	    adev->gmc.xgmi.ras->ras_block.ops->ras_fini)
+		adev->gmc.xgmi.ras->ras_block.ops->ras_fini(adev);
 
 	if (adev->hdp.ras_funcs &&
 	    adev->hdp.ras_funcs->ras_fini)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index e55201134a01..923db5ff5859 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -29,6 +29,7 @@
 #include <linux/types.h>
 
 #include "amdgpu_irq.h"
+#include "amdgpu_ras.h"
 
 /* VA hole for 48bit addresses on Vega10 */
 #define AMDGPU_GMC_HOLE_START	0x0000800000000000ULL
@@ -135,12 +136,8 @@ struct amdgpu_gmc_funcs {
 	unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
 };
 
-struct amdgpu_xgmi_ras_funcs {
-	int (*ras_late_init)(struct amdgpu_device *adev);
-	void (*ras_fini)(struct amdgpu_device *adev);
-	int (*query_ras_error_count)(struct amdgpu_device *adev,
-				     void *ras_error_status);
-	void (*reset_ras_error_count)(struct amdgpu_device *adev);
+struct amdgpu_xgmi_ras {
+	struct amdgpu_ras_block_object ras_block;
 };
 
 struct amdgpu_xgmi {
@@ -159,7 +156,7 @@ struct amdgpu_xgmi {
 	struct ras_common_if *ras_if;
 	bool connected_to_cpu;
 	bool pending_reset;
-	const struct amdgpu_xgmi_ras_funcs *ras_funcs;
+	struct amdgpu_xgmi_ras *ras;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 190a4a4e9d7a..a6a2f928c6ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -970,9 +970,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 			adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-		if (adev->gmc.xgmi.ras_funcs &&
-		    adev->gmc.xgmi.ras_funcs->query_ras_error_count)
-			adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
+		if (!block_obj || !block_obj->ops)	{
+			dev_info(adev->dev, "%s don't config ras function \n",
+				get_ras_block_str(&info->head));
+			return -EINVAL;
+		}
+		if (block_obj->ops->query_ras_error_count)
+			block_obj->ops->query_ras_error_count(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__HDP:
 		if (adev->hdp.ras_funcs &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 0d149f5f000e..da541c7b1ec2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -739,7 +739,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
 	    adev->gmc.xgmi.num_physical_nodes == 0)
 		return 0;
 
-	adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);
+	adev->gmc.xgmi.ras->ras_block.ops->reset_ras_error_count(adev);
 
 	if (!adev->gmc.xgmi.ras_if) {
 		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
@@ -859,7 +859,7 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
 	return 0;
 }
 
-static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
+static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 					     void *ras_error_status)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
@@ -868,7 +868,7 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 	uint32_t ue_cnt = 0, ce_cnt = 0;
 
 	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
-		return -EINVAL;
+		return ;
 
 	err_data->ue_count = 0;
 	err_data->ce_count = 0;
@@ -934,17 +934,36 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 		break;
 	}
 
-	adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);
+	adev->gmc.xgmi.ras->ras_block.ops->reset_ras_error_count(adev);
 
 	err_data->ue_count += ue_cnt;
 	err_data->ce_count += ce_cnt;
+}
 
-	return 0;
+static int amdgpu_xgmi_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
 }
 
-const struct amdgpu_xgmi_ras_funcs xgmi_ras_funcs = {
+struct amdgpu_ras_block_ops  xgmi_ras_ops = {
+	.ras_block_match = amdgpu_xgmi_ras_block_match,
 	.ras_late_init = amdgpu_xgmi_ras_late_init,
 	.ras_fini = amdgpu_xgmi_ras_fini,
 	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
 	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
 };
+
+struct amdgpu_xgmi_ras xgmi_ras = {
+	.ras_block = {
+		.name = "xgmi",
+		.block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
+		.ops = &xgmi_ras_ops,
+	}
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index d2189bf7d428..0afca51c3c0c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -24,7 +24,7 @@
 
 #include <drm/task_barrier.h>
 #include "amdgpu_psp.h"
-
+#include "amdgpu_ras.h"
 
 struct amdgpu_hive_info {
 	struct kobject kobj;
@@ -50,7 +50,7 @@ struct amdgpu_pcs_ras_field {
 	uint32_t pcs_err_shift;
 };
 
-extern const struct amdgpu_xgmi_ras_funcs xgmi_ras_funcs;
+extern struct amdgpu_xgmi_ras  xgmi_ras;
 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
 void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 05/11] drm/amdgpu: Modify hdp block to fit for the unified ras block data and ops
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
                   ` (2 preceding siblings ...)
  2021-12-01 10:52 ` [PATCH V2 04/11] drm/amdgpu: Modify gmc " yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-01 10:52 ` [PATCH V2 06/11] drm/amdgpu: Modify mmhub " yipechai
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1.Modify hdp block to fit for the unified ras block data and ops.
2.Implement .ras_block_match function pointer for hdp block to identify itself.
3.Rename amdgpu_hdp_ras_funcs to amdgpu_hdp_ras, and remove the _funcs suffix from the corresponding variable name.
4.Remove the const qualifier from the hdp ras variable so that the hdp ras block can be inserted into the amdgpu device ras block link list.
5.Invoke amdgpu_ras_register_ras_block function to register hdp ras block into amdgpu device ras block link list.
6.Remove the redundant code about hdp in amdgpu_ras.c after using the unified ras block.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 12 ++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h | 11 ++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++--------
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  9 +++++----
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   | 22 +++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h   |  2 +-
 6 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 3ba2f0f1f1b4..0d06e7a2b951 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -460,9 +460,9 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (adev->hdp.ras_funcs &&
-	    adev->hdp.ras_funcs->ras_late_init) {
-		r = adev->hdp.ras_funcs->ras_late_init(adev);
+	if (adev->hdp.ras && adev->hdp.ras->ras_block.ops &&
+	    adev->hdp.ras->ras_block.ops->ras_late_init) {
+		r = adev->hdp.ras->ras_block.ops->ras_late_init(adev);
 		if (r)
 			return r;
 	}
@@ -505,9 +505,9 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
 	    adev->gmc.xgmi.ras->ras_block.ops->ras_fini)
 		adev->gmc.xgmi.ras->ras_block.ops->ras_fini(adev);
 
-	if (adev->hdp.ras_funcs &&
-	    adev->hdp.ras_funcs->ras_fini)
-		adev->hdp.ras_funcs->ras_fini(adev);
+	if (adev->hdp.ras && adev->hdp.ras->ras_block.ops &&
+	    adev->hdp.ras->ras_block.ops->ras_fini)
+		adev->hdp.ras->ras_block.ops->ras_fini(adev);
 }
 
 	/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
index 7ec99d591584..6e53898fb283 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
@@ -22,13 +22,10 @@
  */
 #ifndef __AMDGPU_HDP_H__
 #define __AMDGPU_HDP_H__
+#include "amdgpu_ras.h"
 
-struct amdgpu_hdp_ras_funcs {
-	int (*ras_late_init)(struct amdgpu_device *adev);
-	void (*ras_fini)(struct amdgpu_device *adev);
-	void (*query_ras_error_count)(struct amdgpu_device *adev,
-				      void *ras_error_status);
-	void (*reset_ras_error_count)(struct amdgpu_device *adev);
+struct amdgpu_hdp_ras{
+	struct amdgpu_ras_block_object ras_block;
 };
 
 struct amdgpu_hdp_funcs {
@@ -43,7 +40,7 @@ struct amdgpu_hdp_funcs {
 struct amdgpu_hdp {
 	struct ras_common_if			*ras_if;
 	const struct amdgpu_hdp_funcs		*funcs;
-	const struct amdgpu_hdp_ras_funcs	*ras_funcs;
+	struct amdgpu_hdp_ras 	*ras;
 };
 
 int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a6a2f928c6ca..bed414404c6f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -970,6 +970,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 			adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
+	case AMDGPU_RAS_BLOCK__HDP:
 		if (!block_obj || !block_obj->ops)	{
 			dev_info(adev->dev, "%s don't config ras function \n",
 				get_ras_block_str(&info->head));
@@ -978,11 +979,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		if (block_obj->ops->query_ras_error_count)
 			block_obj->ops->query_ras_error_count(adev, &err_data);
 		break;
-	case AMDGPU_RAS_BLOCK__HDP:
-		if (adev->hdp.ras_funcs &&
-		    adev->hdp.ras_funcs->query_ras_error_count)
-			adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
 	case AMDGPU_RAS_BLOCK__MCA:
 		amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
 		break;
@@ -1074,9 +1070,13 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 			adev->sdma.funcs->reset_ras_error_count(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__HDP:
-		if (adev->hdp.ras_funcs &&
-		    adev->hdp.ras_funcs->reset_ras_error_count)
-			adev->hdp.ras_funcs->reset_ras_error_count(adev);
+		if (!block_obj || !block_obj->ops)	{
+			dev_info(adev->dev, "%s don't config ras function \n", ras_block_str(block));
+			return -EINVAL;
+		}
+
+		if (block_obj->ops->reset_ras_error_count)
+			block_obj->ops->reset_ras_error_count(adev);
 		break;
 	default:
 		break;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 3606d2cbff5e..c66dc13e256f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1223,7 +1223,8 @@ static void gmc_v9_0_set_gfxhub_funcs(struct amdgpu_device *adev)
 
 static void gmc_v9_0_set_hdp_ras_funcs(struct amdgpu_device *adev)
 {
-	adev->hdp.ras_funcs = &hdp_v4_0_ras_funcs;
+	adev->hdp.ras = &hdp_v4_0_ras;
+	amdgpu_ras_register_ras_block(adev, &adev->hdp.ras->ras_block);
 }
 
 static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
@@ -1300,9 +1301,9 @@ static int gmc_v9_0_late_init(void *handle)
 		    adev->mmhub.ras_funcs->reset_ras_error_count)
 			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
 
-		if (adev->hdp.ras_funcs &&
-		    adev->hdp.ras_funcs->reset_ras_error_count)
-			adev->hdp.ras_funcs->reset_ras_error_count(adev);
+		if (adev->hdp.ras && adev->hdp.ras->ras_block.ops &&
+		    adev->hdp.ras->ras_block.ops->reset_ras_error_count)
+			adev->hdp.ras->ras_block.ops->reset_ras_error_count(adev);
 	}
 
 	r = amdgpu_gmc_ras_late_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
index 74b90cc2bf48..8b15843ea52a 100644
--- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
@@ -149,13 +149,33 @@ static void hdp_v4_0_init_registers(struct amdgpu_device *adev)
 	WREG32_SOC15(HDP, 0, mmHDP_NONSURFACE_BASE_HI, (adev->gmc.vram_start >> 40));
 }
 
-const struct amdgpu_hdp_ras_funcs hdp_v4_0_ras_funcs = {
+static int hdp_v4_0_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+	return -EINVAL;
+}
+
+struct amdgpu_ras_block_ops hdp_v4_0_ras_ops = {
+	.ras_block_match = hdp_v4_0_ras_block_match,
 	.ras_late_init = amdgpu_hdp_ras_late_init,
 	.ras_fini = amdgpu_hdp_ras_fini,
 	.query_ras_error_count = hdp_v4_0_query_ras_error_count,
 	.reset_ras_error_count = hdp_v4_0_reset_ras_error_count,
 };
 
+struct amdgpu_hdp_ras hdp_v4_0_ras = {
+	.ras_block = {
+		.name = "hdp",
+		.block = AMDGPU_RAS_BLOCK__HDP,
+		.ops = &hdp_v4_0_ras_ops,
+	},
+};
+
 const struct amdgpu_hdp_funcs hdp_v4_0_funcs = {
 	.flush_hdp = hdp_v4_0_flush_hdp,
 	.invalidate_hdp = hdp_v4_0_invalidate_hdp,
diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h
index dc3a1b81dd62..c44eee9282ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h
@@ -27,6 +27,6 @@
 #include "soc15_common.h"
 
 extern const struct amdgpu_hdp_funcs hdp_v4_0_funcs;
-extern const struct amdgpu_hdp_ras_funcs hdp_v4_0_ras_funcs;
+extern struct amdgpu_hdp_ras  hdp_v4_0_ras;
 
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 06/11] drm/amdgpu: Modify mmhub block to fit for the unified ras block data and ops
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
                   ` (3 preceding siblings ...)
  2021-12-01 10:52 ` [PATCH V2 05/11] drm/amdgpu: Modify hdp " yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-01 10:52 ` [PATCH V2 07/11] drm/amdgpu: Modify nbio " yipechai
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1.Modify mmhub block to fit for the unified ras block data and ops.
2.Implement .ras_block_match function pointer for mmhub block to identify itself.
3.Rename amdgpu_mmhub_ras_funcs to amdgpu_mmhub_ras, and remove the _funcs suffix from the corresponding variable name.
4.Remove the const qualifier from the mmhub ras variable so that the mmhub ras block can be inserted into the amdgpu device ras block link list.
5.Invoke amdgpu_ras_register_ras_block function to register mmhub ras block into amdgpu device ras block link list.
6.Remove the redundant code about mmhub in amdgpu_ras.c after using the unified ras block.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c    | 12 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h  | 12 ++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 49 +++++++---------------
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 16 ++++---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c    | 23 +++++++++-
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.h    |  2 +-
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c    | 23 +++++++++-
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.h    |  2 +-
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c    | 23 +++++++++-
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h    |  2 +-
 11 files changed, 108 insertions(+), 68 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0980396ee709..c7d5592f0cf6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3377,9 +3377,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 		if (adev->asic_reset_res)
 			goto fail;
 
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->reset_ras_error_count)
-			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+		if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ops &&
+		    adev->mmhub.ras->ras_block.ops->reset_ras_error_count)
+			adev->mmhub.ras->ras_block.ops->reset_ras_error_count(adev);
 	} else {
 
 		task_barrier_full(&hive->tb);
@@ -4705,9 +4705,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 
 	if (!r && amdgpu_ras_intr_triggered()) {
 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-			if (tmp_adev->mmhub.ras_funcs &&
-			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
-				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
+			if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.ops &&
+			    tmp_adev->mmhub.ras->ras_block.ops->reset_ras_error_count)
+				tmp_adev->mmhub.ras->ras_block.ops->reset_ras_error_count(tmp_adev);
 		}
 
 		amdgpu_ras_intr_cleared();
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 0d06e7a2b951..317b5e93a1f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -441,9 +441,9 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (adev->mmhub.ras_funcs &&
-	    adev->mmhub.ras_funcs->ras_late_init) {
-		r = adev->mmhub.ras_funcs->ras_late_init(adev);
+	if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ops &&
+	    adev->mmhub.ras->ras_block.ops->ras_late_init) {
+		r = adev->mmhub.ras->ras_block.ops->ras_late_init(adev);
 		if (r)
 			return r;
 	}
@@ -497,9 +497,9 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
 	    adev->umc.ras_funcs->ras_fini)
 		adev->umc.ras_funcs->ras_fini(adev);
 
-	if (adev->mmhub.ras_funcs &&
-	    adev->mmhub.ras_funcs->ras_fini)
-		adev->mmhub.ras_funcs->ras_fini(adev);
+	if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ops &&
+	    adev->mmhub.ras->ras_block.ops->ras_fini)
+		adev->mmhub.ras->ras_block.ops->ras_fini(adev);
 
 	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ops &&
 	    adev->gmc.xgmi.ras->ras_block.ops->ras_fini)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index b27fcbccce2b..6d10b3f248db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -21,14 +21,8 @@
 #ifndef __AMDGPU_MMHUB_H__
 #define __AMDGPU_MMHUB_H__
 
-struct amdgpu_mmhub_ras_funcs {
-	int (*ras_late_init)(struct amdgpu_device *adev);
-	void (*ras_fini)(struct amdgpu_device *adev);
-	void (*query_ras_error_count)(struct amdgpu_device *adev,
-				      void *ras_error_status);
-	void (*query_ras_error_status)(struct amdgpu_device *adev);
-	void (*reset_ras_error_count)(struct amdgpu_device *adev);
-	void (*reset_ras_error_status)(struct amdgpu_device *adev);
+struct amdgpu_mmhub_ras {
+	struct amdgpu_ras_block_object ras_block;
 };
 
 struct amdgpu_mmhub_funcs {
@@ -50,7 +44,7 @@ struct amdgpu_mmhub_funcs {
 struct amdgpu_mmhub {
 	struct ras_common_if *ras_if;
 	const struct amdgpu_mmhub_funcs *funcs;
-	const struct amdgpu_mmhub_ras_funcs *ras_funcs;
+	struct amdgpu_mmhub_ras  *ras;
 };
 
 int amdgpu_mmhub_ras_late_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bed414404c6f..d705d8b1daf6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -943,6 +943,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		}
 		break;
 	case AMDGPU_RAS_BLOCK__GFX:
+	case AMDGPU_RAS_BLOCK__MMHUB:
 		if (!block_obj || !block_obj->ops)	{
 			dev_info(adev->dev, "%s don't config ras function \n",
 				get_ras_block_str(&info->head));
@@ -955,15 +956,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		if (block_obj->ops->query_ras_error_status)
 			block_obj->ops->query_ras_error_status(adev);
 		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_count)
-			adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
-
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_status)
-			adev->mmhub.ras_funcs->query_ras_error_status(adev);
-		break;
 	case AMDGPU_RAS_BLOCK__PCIE_BIF:
 		if (adev->nbio.ras_funcs &&
 		    adev->nbio.ras_funcs->query_ras_error_count)
@@ -1046,6 +1038,7 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 
 	switch (block) {
 	case AMDGPU_RAS_BLOCK__GFX:
+	case AMDGPU_RAS_BLOCK__MMHUB:
 		if (!block_obj || !block_obj->ops)	{
 			dev_info(adev->dev, "%s don't config ras function \n", ras_block_str(block));
 			return -EINVAL;
@@ -1056,15 +1049,6 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 		if (block_obj->ops->reset_ras_error_status)
 			block_obj->ops->reset_ras_error_status(adev);
 		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->reset_ras_error_count)
-			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
-
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->reset_ras_error_status)
-			adev->mmhub.ras_funcs->reset_ras_error_status(adev);
-		break;
 	case AMDGPU_RAS_BLOCK__SDMA:
 		if (adev->sdma.funcs->reset_ras_error_count)
 			adev->sdma.funcs->reset_ras_error_count(adev);
@@ -1764,29 +1748,24 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
 					  struct ras_query_if *info)
 {
-	struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, info->head.sub_block_index);
+	struct amdgpu_ras_block_object* block_obj = NULL;
 	/*
 	 * Only two block need to query read/write
 	 * RspStatus at current state
 	 */
-	switch (info->head.block) {
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (!block_obj || !block_obj->ops)	{
-			dev_info(adev->dev, "%s don't config ras function \n", get_ras_block_str(&info->head));
-			return ;
-		}
+	if ( (info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
+		 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
+		 return ;
 
-		if (block_obj->ops->query_ras_error_status)
-			block_obj->ops->query_ras_error_status(adev);
-		break;
-	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->query_ras_error_status)
-			adev->mmhub.ras_funcs->query_ras_error_status(adev);
-		break;
-	default:
-		break;
+	block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, info->head.sub_block_index);
+	if (!block_obj || !block_obj->ops)	{
+		dev_info(adev->dev, "%s don't config ras function \n", get_ras_block_str(&info->head));
+		return ;
 	}
+
+	if (block_obj->ops->query_ras_error_status)
+		block_obj->ops->query_ras_error_status(adev);
+
 }
 
 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index c66dc13e256f..53ec18c595e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1202,18 +1202,22 @@ static void gmc_v9_0_set_mmhub_ras_funcs(struct amdgpu_device *adev)
 {
 	switch (adev->asic_type) {
 	case CHIP_VEGA20:
-		adev->mmhub.ras_funcs = &mmhub_v1_0_ras_funcs;
+		adev->mmhub.ras = &mmhub_v1_0_ras;
 		break;
 	case CHIP_ARCTURUS:
-		adev->mmhub.ras_funcs = &mmhub_v9_4_ras_funcs;
+		adev->mmhub.ras = &mmhub_v9_4_ras;
 		break;
 	case CHIP_ALDEBARAN:
-		adev->mmhub.ras_funcs = &mmhub_v1_7_ras_funcs;
+		adev->mmhub.ras = &mmhub_v1_7_ras;
 		break;
 	default:
 		/* mmhub ras is not available */
 		break;
 	}
+
+	if(adev->mmhub.ras)
+		amdgpu_ras_register_ras_block(adev, &adev->mmhub.ras->ras_block);
+
 }
 
 static void gmc_v9_0_set_gfxhub_funcs(struct amdgpu_device *adev)
@@ -1297,9 +1301,9 @@ static int gmc_v9_0_late_init(void *handle)
 	}
 
 	if (!amdgpu_persistent_edc_harvesting_supported(adev)) {
-		if (adev->mmhub.ras_funcs &&
-		    adev->mmhub.ras_funcs->reset_ras_error_count)
-			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+		if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ops &&
+		    adev->mmhub.ras->ras_block.ops->reset_ras_error_count)
+			adev->mmhub.ras->ras_block.ops->reset_ras_error_count(adev);
 
 		if (adev->hdp.ras && adev->hdp.ras->ras_block.ops &&
 		    adev->hdp.ras->ras_block.ops->reset_ras_error_count)
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index b3bede1dc41d..da505314802a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -774,13 +774,34 @@ static void mmhub_v1_0_reset_ras_error_count(struct amdgpu_device *adev)
 	}
 }
 
-const struct amdgpu_mmhub_ras_funcs mmhub_v1_0_ras_funcs = {
+static int mmhub_v1_0_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+struct amdgpu_ras_block_ops mmhub_v1_0_ras_ops = {
+	.ras_block_match = mmhub_v1_0_ras_block_match,
 	.ras_late_init = amdgpu_mmhub_ras_late_init,
 	.ras_fini = amdgpu_mmhub_ras_fini,
 	.query_ras_error_count = mmhub_v1_0_query_ras_error_count,
 	.reset_ras_error_count = mmhub_v1_0_reset_ras_error_count,
 };
 
+struct amdgpu_mmhub_ras mmhub_v1_0_ras = {
+	.ras_block = {
+		.name = "mmhub",
+		.block = AMDGPU_RAS_BLOCK__MMHUB,
+		.ops = &mmhub_v1_0_ras_ops,
+	},
+};
+
 const struct amdgpu_mmhub_funcs mmhub_v1_0_funcs = {
 	.get_fb_location = mmhub_v1_0_get_fb_location,
 	.init = mmhub_v1_0_init,
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.h b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.h
index 4661b094e007..dae7ca48bd8b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.h
@@ -24,6 +24,6 @@
 #define __MMHUB_V1_0_H__
 
 extern const struct amdgpu_mmhub_funcs mmhub_v1_0_funcs;
-extern const struct amdgpu_mmhub_ras_funcs mmhub_v1_0_ras_funcs;
+extern struct amdgpu_mmhub_ras mmhub_v1_0_ras;
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
index f5f7181f9af5..829d14ee87d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -1321,7 +1321,20 @@ static void mmhub_v1_7_reset_ras_error_status(struct amdgpu_device *adev)
 	}
 }
 
-const struct amdgpu_mmhub_ras_funcs mmhub_v1_7_ras_funcs = {
+static int mmhub_v1_7_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+struct amdgpu_ras_block_ops mmhub_v1_7_ras_ops = {
+	.ras_block_match = mmhub_v1_7_ras_block_match,
 	.ras_late_init = amdgpu_mmhub_ras_late_init,
 	.ras_fini = amdgpu_mmhub_ras_fini,
 	.query_ras_error_count = mmhub_v1_7_query_ras_error_count,
@@ -1330,6 +1343,14 @@ const struct amdgpu_mmhub_ras_funcs mmhub_v1_7_ras_funcs = {
 	.reset_ras_error_status = mmhub_v1_7_reset_ras_error_status,
 };
 
+struct amdgpu_mmhub_ras mmhub_v1_7_ras = {
+	.ras_block = {
+		.name = "mmhub",
+		.block = AMDGPU_RAS_BLOCK__MMHUB,
+		.ops = &mmhub_v1_7_ras_ops,
+	},
+};
+
 const struct amdgpu_mmhub_funcs mmhub_v1_7_funcs = {
 	.get_fb_location = mmhub_v1_7_get_fb_location,
 	.init = mmhub_v1_7_init,
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.h b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.h
index a7f9dfc24697..629f49052137 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.h
@@ -24,6 +24,6 @@
 #define __MMHUB_V1_7_H__
 
 extern const struct amdgpu_mmhub_funcs mmhub_v1_7_funcs;
-extern const struct amdgpu_mmhub_ras_funcs mmhub_v1_7_ras_funcs;
+extern struct amdgpu_mmhub_ras mmhub_v1_7_ras;
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index ff49eeaf7882..1edc98e5bcbb 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -1655,7 +1655,20 @@ static void mmhub_v9_4_query_ras_error_status(struct amdgpu_device *adev)
 	}
 }
 
-const struct amdgpu_mmhub_ras_funcs mmhub_v9_4_ras_funcs = {
+static int mmhub_v9_4_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops mmhub_v9_4_ras_ops = {
+	.ras_block_match = mmhub_v9_4_ras_block_match,
 	.ras_late_init = amdgpu_mmhub_ras_late_init,
 	.ras_fini = amdgpu_mmhub_ras_fini,
 	.query_ras_error_count = mmhub_v9_4_query_ras_error_count,
@@ -1663,6 +1676,14 @@ const struct amdgpu_mmhub_ras_funcs mmhub_v9_4_ras_funcs = {
 	.query_ras_error_status = mmhub_v9_4_query_ras_error_status,
 };
 
+struct amdgpu_mmhub_ras mmhub_v9_4_ras = {
+	.ras_block = {
+		.name = "mmhub",
+		.block = AMDGPU_RAS_BLOCK__MMHUB,
+		.ops = &mmhub_v9_4_ras_ops,
+	},
+};
+
 const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = {
 	.get_fb_location = mmhub_v9_4_get_fb_location,
 	.init = mmhub_v9_4_init,
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h
index 90436efa92ef..a48329d95f71 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h
@@ -24,6 +24,6 @@
 #define __MMHUB_V9_4_H__
 
 extern const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs;
-extern const struct amdgpu_mmhub_ras_funcs mmhub_v9_4_ras_funcs;
+extern struct amdgpu_mmhub_ras mmhub_v9_4_ras;
 
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 07/11] drm/amdgpu: Modify nbio block to fit for the unified ras block data and ops
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
                   ` (4 preceding siblings ...)
  2021-12-01 10:52 ` [PATCH V2 06/11] drm/amdgpu: Modify mmhub " yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-01 10:52 ` [PATCH V2 08/11] drm/amdgpu: Modify umc " yipechai
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1.Modify nbio block to fit for the unified ras block data and ops.
2.Implement .ras_block_match function pointer for nbio block to identify itself.
3.Change amdgpu_nbio_ras_funcs to amdgpu_nbio_ras, and remove the _funcs suffix from the corresponding variable name.
4.Remove the const flag of nbio ras variable so that nbio ras block can be inserted into amdgpu device ras block link list.
5.Invoke amdgpu_ras_register_ras_block function to register nbio ras block into amdgpu device ras block link list.
6.Remove the redundant code about nbio in amdgpu_ras.c after using the unified ras block.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c  | 12 +++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  9 +++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 22 ++++++++---------
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 30 ++++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/soc15.c       | 20 ++++++++--------
 6 files changed, 56 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 5208b2dd176a..24feceb51289 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -208,13 +208,13 @@ irqreturn_t amdgpu_irq_handler(int irq, void *arg)
 	 * ack the interrupt if it is there
 	 */
 	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) {
-		if (adev->nbio.ras_funcs &&
-		    adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring)
-			adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring(adev);
+		if (adev->nbio.ras &&
+		    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
+			adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
 
-		if (adev->nbio.ras_funcs &&
-		    adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring)
-			adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring(adev);
+		if (adev->nbio.ras &&
+		    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
+			adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
 	}
 
 	return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 843052205bd5..4a1fb85939d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -47,15 +47,12 @@ struct nbio_hdp_flush_reg {
 	u32 ref_and_mask_sdma7;
 };
 
-struct amdgpu_nbio_ras_funcs {
+struct amdgpu_nbio_ras {
+	struct amdgpu_ras_block_object ras_block;
 	void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device *adev);
 	void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device *adev);
 	int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
 	int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev);
-	void (*query_ras_error_count)(struct amdgpu_device *adev,
-				      void *ras_error_status);
-	int (*ras_late_init)(struct amdgpu_device *adev);
-	void (*ras_fini)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_nbio_funcs {
@@ -104,7 +101,7 @@ struct amdgpu_nbio {
 	struct amdgpu_irq_src ras_err_event_athub_irq;
 	struct ras_common_if *ras_if;
 	const struct amdgpu_nbio_funcs *funcs;
-	const struct amdgpu_nbio_ras_funcs *ras_funcs;
+	struct amdgpu_nbio_ras  *ras;
 };
 
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d705d8b1daf6..273a550741e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -957,10 +957,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 			block_obj->ops->query_ras_error_status(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__PCIE_BIF:
-		if (adev->nbio.ras_funcs &&
-		    adev->nbio.ras_funcs->query_ras_error_count)
-			adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
 	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
 	case AMDGPU_RAS_BLOCK__HDP:
 		if (!block_obj || !block_obj->ops)	{
@@ -2336,24 +2332,26 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	case CHIP_VEGA20:
 	case CHIP_ARCTURUS:
 	case CHIP_ALDEBARAN:
-		if (!adev->gmc.xgmi.connected_to_cpu)
-			adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+		if (!adev->gmc.xgmi.connected_to_cpu) {
+			adev->nbio.ras = &nbio_v7_4_ras;
+			amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
+		}
 		break;
 	default:
 		/* nbio ras is not available */
 		break;
 	}
 
-	if (adev->nbio.ras_funcs &&
-	    adev->nbio.ras_funcs->init_ras_controller_interrupt) {
-		r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->init_ras_controller_interrupt) {
+		r = adev->nbio.ras->init_ras_controller_interrupt(adev);
 		if (r)
 			goto release_con;
 	}
 
-	if (adev->nbio.ras_funcs &&
-	    adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
-		r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->init_ras_err_event_athub_interrupt) {
+		r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
 		if (r)
 			goto release_con;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 91b3afa946f5..14f7265d954e 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -638,16 +638,38 @@ static void nbio_v7_4_enable_doorbell_interrupt(struct amdgpu_device *adev,
 		       DOORBELL_INTERRUPT_DISABLE, enable ? 0 : 1);
 }
 
-const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs = {
+static int nbio_v7_4_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops nbio_v7_4_ras_ops = {
+	.ras_block_match = nbio_v7_4_ras_block_match,
+	.query_ras_error_count = nbio_v7_4_query_ras_error_count,
+	.ras_late_init = amdgpu_nbio_ras_late_init,
+	.ras_fini = amdgpu_nbio_ras_fini,
+};
+
+struct amdgpu_nbio_ras nbio_v7_4_ras = {
+	.ras_block = {
+		.name = "pcie_bif",
+		.block = AMDGPU_RAS_BLOCK__PCIE_BIF,
+		.ops = &nbio_v7_4_ras_ops,
+	},
 	.handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring,
 	.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
 	.init_ras_controller_interrupt = nbio_v7_4_init_ras_controller_interrupt,
 	.init_ras_err_event_athub_interrupt = nbio_v7_4_init_ras_err_event_athub_interrupt,
-	.query_ras_error_count = nbio_v7_4_query_ras_error_count,
-	.ras_late_init = amdgpu_nbio_ras_late_init,
-	.ras_fini = amdgpu_nbio_ras_fini,
 };
 
+
 static void nbio_v7_4_program_ltr(struct amdgpu_device *adev)
 {
 	uint32_t def, data;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h
index b8216581ec8d..f27c41728822 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h
@@ -28,6 +28,6 @@
 
 extern const struct nbio_hdp_flush_reg nbio_v7_4_hdp_flush_reg;
 extern const struct amdgpu_nbio_funcs nbio_v7_4_funcs;
-extern const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs;
+extern struct amdgpu_nbio_ras nbio_v7_4_ras;
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index f9d92b6deef0..897c7e784701 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1385,9 +1385,9 @@ static int soc15_common_late_init(void *handle)
 	if (amdgpu_sriov_vf(adev))
 		xgpu_ai_mailbox_get_irq(adev);
 
-	if (adev->nbio.ras_funcs &&
-	    adev->nbio.ras_funcs->ras_late_init)
-		r = adev->nbio.ras_funcs->ras_late_init(adev);
+	if (adev->nbio.ras && adev->nbio.ras->ras_block.ops &&
+	    adev->nbio.ras->ras_block.ops->ras_late_init)
+		r = adev->nbio.ras->ras_block.ops->ras_late_init(adev);
 
 	return r;
 }
@@ -1408,9 +1408,9 @@ static int soc15_common_sw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	if (adev->nbio.ras_funcs &&
-	    adev->nbio.ras_funcs->ras_fini)
-		adev->nbio.ras_funcs->ras_fini(adev);
+	if (adev->nbio.ras && adev->nbio.ras->ras_block.ops &&
+	    adev->nbio.ras->ras_block.ops->ras_fini)
+		adev->nbio.ras->ras_block.ops->ras_fini(adev);
 	adev->df.funcs->sw_fini(adev);
 	return 0;
 }
@@ -1474,11 +1474,11 @@ static int soc15_common_hw_fini(void *handle)
 
 	if (adev->nbio.ras_if &&
 	    amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
-		if (adev->nbio.ras_funcs &&
-		    adev->nbio.ras_funcs->init_ras_controller_interrupt)
+		if (adev->nbio.ras &&
+		    adev->nbio.ras->init_ras_controller_interrupt)
 			amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
-		if (adev->nbio.ras_funcs &&
-		    adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt)
+		if (adev->nbio.ras &&
+		    adev->nbio.ras->init_ras_err_event_athub_interrupt)
 			amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
 	}
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 08/11] drm/amdgpu: Modify umc block to fit for the unified ras block data and ops
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
                   ` (5 preceding siblings ...)
  2021-12-01 10:52 ` [PATCH V2 07/11] drm/amdgpu: Modify nbio " yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-01 10:52 ` [PATCH V2 09/11] drm/amdgpu: Modify sdma " yipechai
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1.Modify umc block to fit for the unified ras block data and ops.
2.Implement .ras_block_match function pointer for umc block to identify itself.
3.Change amdgpu_umc_ras_funcs to amdgpu_umc_ras, and remove the _funcs suffix from the corresponding variable name.
4.Remove the const flag of umc ras variable so that umc ras block can be inserted into amdgpu device ras block link list.
5.Invoke amdgpu_ras_register_ras_block function to register umc ras block into amdgpu device ras block link list.
6.Remove the redundant code about umc in amdgpu_ras.c after using the unified ras block.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 12 ++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 ++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 18 +++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 13 ++++---------
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  4 +++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  9 ++++++---
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c   | 25 +++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   | 23 ++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/umc_v8_7.c   | 25 +++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/umc_v8_7.h   |  2 +-
 12 files changed, 111 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 317b5e93a1f0..ead143214448 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -434,9 +434,9 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 {
 	int r;
 
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->ras_late_init) {
-		r = adev->umc.ras_funcs->ras_late_init(adev);
+	if (adev->umc.ras && adev->umc.ras->ras_block.ops &&
+	    adev->umc.ras->ras_block.ops->ras_late_init) {
+		r = adev->umc.ras->ras_block.ops->ras_late_init(adev);
 		if (r)
 			return r;
 	}
@@ -493,9 +493,9 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
 {
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->ras_fini)
-		adev->umc.ras_funcs->ras_fini(adev);
+	if (adev->umc.ras && adev->umc.ras->ras_block.ops &&
+	    adev->umc.ras->ras_block.ops->ras_fini)
+		adev->umc.ras->ras_block.ops->ras_fini(adev);
 
 	if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ops &&
 	    adev->mmhub.ras->ras_block.ops->ras_fini)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 273a550741e4..7d050afd7e2e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -925,15 +925,18 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__UMC:
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_count)
-			adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
+		if (!block_obj || !block_obj->ops)	{
+			dev_info(adev->dev, "%s don't config ras function \n",
+				get_ras_block_str(&info->head));
+			return -EINVAL;
+		}
+		if (block_obj->ops->query_ras_error_count)
+			block_obj->ops->query_ras_error_count(adev, &err_data);
 		/* umc query_ras_error_address is also responsible for clearing
 		 * error status
 		 */
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_address)
-			adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
+		if (block_obj->ops->query_ras_error_address)
+			block_obj->ops->query_ras_error_address(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__SDMA:
 		if (adev->sdma.funcs->query_ras_error_count) {
@@ -2359,12 +2362,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	/* Init poison supported flag, the default value is false */
 	if (adev->df.funcs &&
 	    adev->df.funcs->query_ras_poison_mode &&
-	    adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_poison_mode) {
+	    adev->umc.ras && adev->umc.ras->ras_block.ops &&
+	    adev->umc.ras->ras_block.ops->query_ras_poison_mode) {
 		df_poison =
 			adev->df.funcs->query_ras_poison_mode(adev);
 		umc_poison =
-			adev->umc.ras_funcs->query_ras_poison_mode(adev);
+			adev->umc.ras->ras_block.ops->query_ras_poison_mode(adev);
 		/* Only poison is set in both DF and UMC, we can support it */
 		if (df_poison && umc_poison)
 			con->poison_supported = true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c7c56a91b25..2624421b131e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -60,9 +60,9 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
 	}
 
 	/* ras init of specific umc version */
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->err_cnt_init)
-		adev->umc.ras_funcs->err_cnt_init(adev);
+	if (adev->umc.ras &&
+	    adev->umc.ras->err_cnt_init)
+		adev->umc.ras->err_cnt_init(adev);
 
 	return 0;
 
@@ -97,12 +97,12 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_error_count)
-	    adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
+	if (adev->umc.ras && adev->umc.ras->ras_block.ops &&
+	    adev->umc.ras->ras_block.ops->query_ras_error_count)
+	    adev->umc.ras->ras_block.ops->query_ras_error_count(adev, ras_error_status);
 
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_error_address &&
+	if (adev->umc.ras && adev->umc.ras->ras_block.ops &&
+	    adev->umc.ras->ras_block.ops->query_ras_error_address &&
 	    adev->umc.max_ras_err_cnt_per_query) {
 		err_data->err_addr =
 			kcalloc(adev->umc.max_ras_err_cnt_per_query,
@@ -118,7 +118,7 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 		/* umc query_ras_error_address is also responsible for clearing
 		 * error status
 		 */
-		adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+		adev->umc.ras->ras_block.ops->query_ras_error_address(adev, ras_error_status);
 	}
 
 	/* only uncorrectable error needs gpu reset */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 1f5fe2315236..cf8af55d904a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -20,6 +20,7 @@
  */
 #ifndef __AMDGPU_UMC_H__
 #define __AMDGPU_UMC_H__
+#include "amdgpu_ras.h"
 
 /*
  * (addr / 256) * 4096, the higher 26 bits in ErrorAddr
@@ -40,15 +41,9 @@
 #define LOOP_UMC_CH_INST(ch_inst) for ((ch_inst) = 0; (ch_inst) < adev->umc.channel_inst_num; (ch_inst)++)
 #define LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) LOOP_UMC_INST((umc_inst)) LOOP_UMC_CH_INST((ch_inst))
 
-struct amdgpu_umc_ras_funcs {
+struct amdgpu_umc_ras {
+	struct amdgpu_ras_block_object ras_block;
 	void (*err_cnt_init)(struct amdgpu_device *adev);
-	int (*ras_late_init)(struct amdgpu_device *adev);
-	void (*ras_fini)(struct amdgpu_device *adev);
-	void (*query_ras_error_count)(struct amdgpu_device *adev,
-				      void *ras_error_status);
-	void (*query_ras_error_address)(struct amdgpu_device *adev,
-					void *ras_error_status);
-	bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_umc_funcs {
@@ -69,7 +64,7 @@ struct amdgpu_umc {
 	struct ras_common_if *ras_if;
 
 	const struct amdgpu_umc_funcs *funcs;
-	const struct amdgpu_umc_ras_funcs *ras_funcs;
+	struct amdgpu_umc_ras *ras;
 };
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index bbddb87d7d17..b12bd2c78778 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -663,11 +663,13 @@ static void gmc_v10_0_set_umc_funcs(struct amdgpu_device *adev)
 		adev->umc.umc_inst_num = UMC_V8_7_UMC_INSTANCE_NUM;
 		adev->umc.channel_offs = UMC_V8_7_PER_CHANNEL_OFFSET_SIENNA;
 		adev->umc.channel_idx_tbl = &umc_v8_7_channel_idx_tbl[0][0];
-		adev->umc.ras_funcs = &umc_v8_7_ras_funcs;
+		adev->umc.ras = &umc_v8_7_ras;
 		break;
 	default:
 		break;
 	}
+	if(adev->umc.ras)
+		amdgpu_ras_register_ras_block(adev, &adev->umc.ras->ras_block);
 }
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 53ec18c595e8..c1c4b1c6c1e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1156,7 +1156,7 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
 		adev->umc.umc_inst_num = UMC_V6_1_UMC_INSTANCE_NUM;
 		adev->umc.channel_offs = UMC_V6_1_PER_CHANNEL_OFFSET_VG20;
 		adev->umc.channel_idx_tbl = &umc_v6_1_channel_idx_tbl[0][0];
-		adev->umc.ras_funcs = &umc_v6_1_ras_funcs;
+		adev->umc.ras = &umc_v6_1_ras;
 		break;
 	case CHIP_ARCTURUS:
 		adev->umc.max_ras_err_cnt_per_query = UMC_V6_1_TOTAL_CHANNEL_NUM;
@@ -1164,7 +1164,7 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
 		adev->umc.umc_inst_num = UMC_V6_1_UMC_INSTANCE_NUM;
 		adev->umc.channel_offs = UMC_V6_1_PER_CHANNEL_OFFSET_ARCT;
 		adev->umc.channel_idx_tbl = &umc_v6_1_channel_idx_tbl[0][0];
-		adev->umc.ras_funcs = &umc_v6_1_ras_funcs;
+		adev->umc.ras = &umc_v6_1_ras;
 		break;
 	case CHIP_ALDEBARAN:
 		adev->umc.max_ras_err_cnt_per_query = UMC_V6_7_TOTAL_CHANNEL_NUM;
@@ -1172,7 +1172,7 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
 		adev->umc.umc_inst_num = UMC_V6_7_UMC_INSTANCE_NUM;
 		adev->umc.channel_offs = UMC_V6_7_PER_CHANNEL_OFFSET;
 		if (!adev->gmc.xgmi.connected_to_cpu)
-			adev->umc.ras_funcs = &umc_v6_7_ras_funcs;
+			adev->umc.ras = &umc_v6_7_ras;
 		if (1 & adev->smuio.funcs->get_die_id(adev))
 			adev->umc.channel_idx_tbl = &umc_v6_7_channel_idx_tbl_first[0][0];
 		else
@@ -1181,6 +1181,9 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
 	default:
 		break;
 	}
+
+	if(adev->umc.ras)
+		amdgpu_ras_register_ras_block(adev, &adev->umc.ras->ras_block);
 }
 
 static void gmc_v9_0_set_mmhub_funcs(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
index 921da7dffb1c..ed480c2081a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
@@ -465,10 +465,31 @@ static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev)
 		umc_v6_1_enable_umc_index_mode(adev);
 }
 
-const struct amdgpu_umc_ras_funcs umc_v6_1_ras_funcs = {
-	.err_cnt_init = umc_v6_1_err_cnt_init,
+static int umc_v6_1_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops umc_v6_1_ras_ops = {
+	.ras_block_match = umc_v6_1_ras_block_match,
 	.ras_late_init = amdgpu_umc_ras_late_init,
 	.ras_fini = amdgpu_umc_ras_fini,
 	.query_ras_error_count = umc_v6_1_query_ras_error_count,
 	.query_ras_error_address = umc_v6_1_query_ras_error_address,
 };
+
+struct amdgpu_umc_ras umc_v6_1_ras = {
+	.ras_block = {
+		.name = "umc",
+		.block = AMDGPU_RAS_BLOCK__UMC,
+		.ops = &umc_v6_1_ras_ops,
+	},
+	.err_cnt_init = umc_v6_1_err_cnt_init,
+};
\ No newline at end of file
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.h b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.h
index 5dc36c730bb2..50c632eb4cc6 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.h
@@ -45,7 +45,7 @@
 /* umc ce count initial value */
 #define UMC_V6_1_CE_CNT_INIT	(UMC_V6_1_CE_CNT_MAX - UMC_V6_1_CE_INT_THRESHOLD)
 
-extern const struct amdgpu_umc_ras_funcs umc_v6_1_ras_funcs;
+extern struct amdgpu_umc_ras umc_v6_1_ras;
 extern const uint32_t
 	umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index f7ec3fe134e5..e26728dbc6e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -321,10 +321,31 @@ static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
 	return true;
 }
 
-const struct amdgpu_umc_ras_funcs umc_v6_7_ras_funcs = {
+static int umc_v6_7_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops umc_v6_7_ras_pos = {
+	.ras_block_match = umc_v6_7_ras_block_match,
 	.ras_late_init = amdgpu_umc_ras_late_init,
 	.ras_fini = amdgpu_umc_ras_fini,
 	.query_ras_error_count = umc_v6_7_query_ras_error_count,
 	.query_ras_error_address = umc_v6_7_query_ras_error_address,
 	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
 };
+
+struct amdgpu_umc_ras umc_v6_7_ras = {
+	.ras_block = {
+		.name = "umc",
+		.block = AMDGPU_RAS_BLOCK__UMC,
+		.ops = &umc_v6_7_ras_pos,
+	},
+};
\ No newline at end of file
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
index 57f2557e7aca..1f2edf625370 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
@@ -43,7 +43,7 @@
 #define UMC_V6_7_TOTAL_CHANNEL_NUM	(UMC_V6_7_CHANNEL_INSTANCE_NUM * UMC_V6_7_UMC_INSTANCE_NUM)
 /* UMC regiser per channel offset */
 #define UMC_V6_7_PER_CHANNEL_OFFSET		0x400
-extern const struct amdgpu_umc_ras_funcs umc_v6_7_ras_funcs;
+extern struct amdgpu_umc_ras umc_v6_7_ras;
 extern const uint32_t
 	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
 extern const uint32_t
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
index af59a35788e3..037791e90c24 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
@@ -324,10 +324,31 @@ static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
 	}
 }
 
-const struct amdgpu_umc_ras_funcs umc_v8_7_ras_funcs = {
-	.err_cnt_init = umc_v8_7_err_cnt_init,
+static int umc_v8_7_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops umc_v8_7_ras_ops = {
+	.ras_block_match = umc_v8_7_ras_block_match,
 	.ras_late_init = amdgpu_umc_ras_late_init,
 	.ras_fini = amdgpu_umc_ras_fini,
 	.query_ras_error_count = umc_v8_7_query_ras_error_count,
 	.query_ras_error_address = umc_v8_7_query_ras_error_address,
 };
+
+struct amdgpu_umc_ras umc_v8_7_ras = {
+	.ras_block = {
+		.name = "umc",
+		.block = AMDGPU_RAS_BLOCK__UMC,
+		.ops = &umc_v8_7_ras_ops,
+	},
+	.err_cnt_init = umc_v8_7_err_cnt_init,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.h b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.h
index 37e6dc7c28e0..dd4993f5f78f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.h
@@ -44,7 +44,7 @@
 /* umc ce count initial value */
 #define UMC_V8_7_CE_CNT_INIT	(UMC_V8_7_CE_CNT_MAX - UMC_V8_7_CE_INT_THRESHOLD)
 
-extern const struct amdgpu_umc_ras_funcs umc_v8_7_ras_funcs;
+extern struct amdgpu_umc_ras umc_v8_7_ras;
 extern const uint32_t
 	umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM];
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 09/11] drm/amdgpu: Modify sdma block to fit for the unified ras block data and ops
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
                   ` (6 preceding siblings ...)
  2021-12-01 10:52 ` [PATCH V2 08/11] drm/amdgpu: Modify umc " yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-01 10:52 ` [PATCH V2 10/11] drm/amdgpu: Modify mca " yipechai
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1.Modify sdma block to fit for the unified ras block data and ops.
2.Implement .ras_block_match function pointer for sdma block to identify itself.
3.Change amdgpu_sdma_ras_funcs to amdgpu_sdma_ras, and remove the _funcs suffix from the corresponding variable name.
4.Remove the const flag of sdma ras variable so that sdma ras block can be inserted into amdgpu device ras block link list.
5.Invoke amdgpu_ras_register_ras_block function to register sdma ras block into amdgpu device ras block link list.
6.Remove the redundant code about sdma in amdgpu_ras.c after using the unified ras block.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  9 ----
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 13 ++---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 61 +++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 40 ++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4.h   |  2 +-
 5 files changed, 92 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7d050afd7e2e..6a145d0e0032 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -939,12 +939,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 			block_obj->ops->query_ras_error_address(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__SDMA:
-		if (adev->sdma.funcs->query_ras_error_count) {
-			for (i = 0; i < adev->sdma.num_instances; i++)
-				adev->sdma.funcs->query_ras_error_count(adev, i,
-									&err_data);
-		}
-		break;
 	case AMDGPU_RAS_BLOCK__GFX:
 	case AMDGPU_RAS_BLOCK__MMHUB:
 		if (!block_obj || !block_obj->ops)	{
@@ -1049,9 +1043,6 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 			block_obj->ops->reset_ras_error_status(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__SDMA:
-		if (adev->sdma.funcs->reset_ras_error_count)
-			adev->sdma.funcs->reset_ras_error_count(adev);
-		break;
 	case AMDGPU_RAS_BLOCK__HDP:
 		if (!block_obj || !block_obj->ops)	{
 			dev_info(adev->dev, "%s don't config ras function \n", ras_block_str(block));
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index f8fb755e3aa6..a0761cf50ae0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -23,6 +23,7 @@
 
 #ifndef __AMDGPU_SDMA_H__
 #define __AMDGPU_SDMA_H__
+#include "amdgpu_ras.h"
 
 /* max number of IP instances */
 #define AMDGPU_MAX_SDMA_INSTANCES		8
@@ -50,13 +51,9 @@ struct amdgpu_sdma_instance {
 	bool			burst_nop;
 };
 
-struct amdgpu_sdma_ras_funcs {
-	int (*ras_late_init)(struct amdgpu_device *adev,
-			void *ras_ih_info);
-	void (*ras_fini)(struct amdgpu_device *adev);
-	int (*query_ras_error_count)(struct amdgpu_device *adev,
-			uint32_t instance, void *ras_error_status);
-	void (*reset_ras_error_count)(struct amdgpu_device *adev);
+struct amdgpu_sdma_ras {
+	struct amdgpu_ras_block_object ras_block;
+	int (*sdma_ras_late_init)(struct amdgpu_device *adev, void *ras_ih_info);
 };
 
 struct amdgpu_sdma {
@@ -73,7 +70,7 @@ struct amdgpu_sdma {
 	uint32_t                    srbm_soft_reset;
 	bool			has_page_queue;
 	struct ras_common_if	*ras_if;
-	const struct amdgpu_sdma_ras_funcs	*funcs;
+	struct amdgpu_sdma_ras	*ras;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 69c9e460c1eb..30a651613776 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1898,13 +1898,13 @@ static int sdma_v4_0_late_init(void *handle)
 	sdma_v4_0_setup_ulv(adev);
 
 	if (!amdgpu_persistent_edc_harvesting_supported(adev)) {
-		if (adev->sdma.funcs &&
-		    adev->sdma.funcs->reset_ras_error_count)
-			adev->sdma.funcs->reset_ras_error_count(adev);
+		if (adev->sdma.ras && adev->sdma.ras->ras_block.ops &&
+		    adev->sdma.ras->ras_block.ops->reset_ras_error_count)
+			adev->sdma.ras->ras_block.ops->reset_ras_error_count(adev);
 	}
 
-	if (adev->sdma.funcs && adev->sdma.funcs->ras_late_init)
-		return adev->sdma.funcs->ras_late_init(adev, &ih_info);
+	if (adev->sdma.ras && adev->sdma.ras->sdma_ras_late_init)
+		return adev->sdma.ras->sdma_ras_late_init(adev, &ih_info);
 	else
 		return 0;
 }
@@ -2007,8 +2007,9 @@ static int sdma_v4_0_sw_fini(void *handle)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int i;
 
-	if (adev->sdma.funcs && adev->sdma.funcs->ras_fini)
-		adev->sdma.funcs->ras_fini(adev);
+	if (adev->sdma.ras && adev->sdma.ras->ras_block.ops &&
+		adev->sdma.ras->ras_block.ops->ras_fini)
+		adev->sdma.ras->ras_block.ops->ras_fini(adev);
 
 	for (i = 0; i < adev->sdma.num_instances; i++) {
 		amdgpu_ring_fini(&adev->sdma.instance[i].ring);
@@ -2745,7 +2746,7 @@ static void sdma_v4_0_get_ras_error_count(uint32_t value,
 	}
 }
 
-static int sdma_v4_0_query_ras_error_count(struct amdgpu_device *adev,
+static int sdma_v4_0_query_ras_error_count_by_instance(struct amdgpu_device *adev,
 			uint32_t instance, void *ras_error_status)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
@@ -2767,6 +2768,18 @@ static int sdma_v4_0_query_ras_error_count(struct amdgpu_device *adev,
 	return 0;
 };
 
+static void sdma_v4_0_query_ras_error_count(struct amdgpu_device *adev,  void *ras_error_status)
+{
+	int i = 0;
+	for (i = 0; i < adev->sdma.num_instances; i++) {
+		if (sdma_v4_0_query_ras_error_count_by_instance(adev, i, ras_error_status))
+		{
+			dev_err(adev->dev, "Query ras error count failed in SDMA%d \n", i);
+			return;
+		}
+	}
+}
+
 static void sdma_v4_0_reset_ras_error_count(struct amdgpu_device *adev)
 {
 	int i;
@@ -2778,26 +2791,50 @@ static void sdma_v4_0_reset_ras_error_count(struct amdgpu_device *adev)
 	}
 }
 
-static const struct amdgpu_sdma_ras_funcs sdma_v4_0_ras_funcs = {
-	.ras_late_init = amdgpu_sdma_ras_late_init,
+static int sdma_v4_0_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops sdma_v4_0_ras_ops = {
+	.ras_block_match = sdma_v4_0_ras_block_match,
 	.ras_fini = amdgpu_sdma_ras_fini,
 	.query_ras_error_count = sdma_v4_0_query_ras_error_count,
 	.reset_ras_error_count = sdma_v4_0_reset_ras_error_count,
 };
 
+static struct amdgpu_sdma_ras sdma_v4_0_ras = {
+	.ras_block = {
+		.name = "sdma",
+		.block = AMDGPU_RAS_BLOCK__SDMA,
+		.ops = &sdma_v4_0_ras_ops,
+	},
+	.sdma_ras_late_init = amdgpu_sdma_ras_late_init,
+};
+
 static void sdma_v4_0_set_ras_funcs(struct amdgpu_device *adev)
 {
 	switch (adev->asic_type) {
 	case CHIP_VEGA20:
 	case CHIP_ARCTURUS:
-		adev->sdma.funcs = &sdma_v4_0_ras_funcs;
+		adev->sdma.ras = &sdma_v4_0_ras;
 		break;
 	case CHIP_ALDEBARAN:
-		adev->sdma.funcs = &sdma_v4_4_ras_funcs;
+		adev->sdma.ras = &sdma_v4_4_ras;
 		break;
 	default:
 		break;
 	}
+
+	if(adev->sdma.ras)
+		amdgpu_ras_register_ras_block(adev, &adev->sdma.ras->ras_block);
 }
 
 const struct amdgpu_ip_block_version sdma_v4_0_ip_block = {
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
index bf95007f0843..8c165bcb0ffa 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
@@ -188,7 +188,7 @@ static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
 	}
 }
 
-static int sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev,
+static int sdma_v4_4_query_ras_error_count_by_instance(struct amdgpu_device *adev,
 					   uint32_t instance,
 					   void *ras_error_status)
 {
@@ -245,9 +245,43 @@ static void sdma_v4_4_reset_ras_error_count(struct amdgpu_device *adev)
 	}
 }
 
-const struct amdgpu_sdma_ras_funcs sdma_v4_4_ras_funcs = {
-	.ras_late_init = amdgpu_sdma_ras_late_init,
+static void sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev,  void *ras_error_status)
+{
+	int i = 0;
+	for (i = 0; i < adev->sdma.num_instances; i++) {
+		if (sdma_v4_4_query_ras_error_count_by_instance(adev, i, ras_error_status))
+		{
+			dev_err(adev->dev, "Query ras error count failed in SDMA%d \n", i);
+			return;
+		}
+	}
+
+}
+
+static int sdma_v4_4_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if(block_obj->block == block) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops sdma_v4_4_ras_ops = {
+	.ras_block_match = sdma_v4_4_ras_block_match,
 	.ras_fini = amdgpu_sdma_ras_fini,
 	.query_ras_error_count = sdma_v4_4_query_ras_error_count,
 	.reset_ras_error_count = sdma_v4_4_reset_ras_error_count,
 };
+
+struct amdgpu_sdma_ras sdma_v4_4_ras = {
+	.ras_block = {
+		.name = "sdma",
+		.block = AMDGPU_RAS_BLOCK__SDMA,
+		.ops = &sdma_v4_4_ras_ops,
+	},
+	.sdma_ras_late_init = amdgpu_sdma_ras_late_init,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.h b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.h
index 74a6e5b5e949..a9f0c68359e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.h
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.h
@@ -23,6 +23,6 @@
 #ifndef __SDMA_V4_4_H__
 #define __SDMA_V4_4_H__
 
-extern const struct amdgpu_sdma_ras_funcs sdma_v4_4_ras_funcs;
+extern struct amdgpu_sdma_ras sdma_v4_4_ras;
 
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 10/11] drm/amdgpu: Modify mca block to fit for the unified ras block data and ops
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
                   ` (7 preceding siblings ...)
  2021-12-01 10:52 ` [PATCH V2 09/11] drm/amdgpu: Modify sdma " yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-01 10:52 ` [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block yipechai
  2021-12-06  7:36 ` [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block Zhou1, Tao
  10 siblings, 0 replies; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

1. Modify the mca block to fit the unified ras block data and ops.
2. Implement the .ras_block_match function pointer so the mca block can identify itself.
3. Rename amdgpu_mca_ras_funcs to amdgpu_mca_ras_block (the name amdgpu_mca_ras is already in use), and drop the _funcs suffix from the corresponding variable names.
4. Remove the const qualifier from the mca ras variables so that the mca ras blocks can be inserted into the amdgpu device ras block linked list.
5. Invoke amdgpu_ras_register_ras_block to register the mca ras blocks in the amdgpu device ras block linked list.
6. Remove the now-redundant mca code from amdgpu_ras.c after switching to the unified ras block.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 18 +++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c |  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 14 ++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +----------
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   | 67 +++++++++++++++++++------
 5 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index ead143214448..065d98cc028f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -467,23 +467,23 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (adev->mca.mp0.ras_funcs &&
-	    adev->mca.mp0.ras_funcs->ras_late_init) {
-		r = adev->mca.mp0.ras_funcs->ras_late_init(adev);
+	if (adev->mca.mp0.ras && adev->mca.mp0.ras->ras_block.ops &&
+	    adev->mca.mp0.ras->ras_block.ops->ras_late_init) {
+		r = adev->mca.mp0.ras->ras_block.ops->ras_late_init(adev);
 		if (r)
 			return r;
 	}
 
-	if (adev->mca.mp1.ras_funcs &&
-	    adev->mca.mp1.ras_funcs->ras_late_init) {
-		r = adev->mca.mp1.ras_funcs->ras_late_init(adev);
+	if (adev->mca.mp1.ras && adev->mca.mp1.ras->ras_block.ops &&
+	    adev->mca.mp1.ras->ras_block.ops->ras_late_init) {
+		r = adev->mca.mp1.ras->ras_block.ops->ras_late_init(adev);
 		if (r)
 			return r;
 	}
 
-	if (adev->mca.mpio.ras_funcs &&
-	    adev->mca.mpio.ras_funcs->ras_late_init) {
-		r = adev->mca.mpio.ras_funcs->ras_late_init(adev);
+	if (adev->mca.mpio.ras && adev->mca.mpio.ras->ras_block.ops &&
+	    adev->mca.mpio.ras->ras_block.ops->ras_late_init) {
+		r = adev->mca.mpio.ras->ras_block.ops->ras_late_init(adev);
 		if (r)
 			return r;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index ce538f4819f9..86dbe485a644 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -79,15 +79,15 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
 		.cb = NULL,
 	};
 	struct ras_fs_if fs_info = {
-		.sysfs_name = mca_dev->ras_funcs->sysfs_name,
+		.sysfs_name = mca_dev->ras->ras_block.name,
 	};
 
 	if (!mca_dev->ras_if) {
 		mca_dev->ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
 		if (!mca_dev->ras_if)
 			return -ENOMEM;
-		mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
-		mca_dev->ras_if->sub_block_index = mca_dev->ras_funcs->ras_sub_block;
+		mca_dev->ras_if->block = mca_dev->ras->ras_block.block;
+		mca_dev->ras_if->sub_block_index = mca_dev->ras->ras_block.sub_block_index;
 		mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 	}
 	ih_info.head = fs_info.head = *mca_dev->ras_if;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index c74bc7177066..be030c4031d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -21,21 +21,13 @@
 #ifndef __AMDGPU_MCA_H__
 #define __AMDGPU_MCA_H__
 
-struct amdgpu_mca_ras_funcs {
-	int (*ras_late_init)(struct amdgpu_device *adev);
-	void (*ras_fini)(struct amdgpu_device *adev);
-	void (*query_ras_error_count)(struct amdgpu_device *adev,
-				      void *ras_error_status);
-	void (*query_ras_error_address)(struct amdgpu_device *adev,
-					void *ras_error_status);
-	uint32_t ras_block;
-	uint32_t ras_sub_block;
-	const char* sysfs_name;
+struct amdgpu_mca_ras_block {
+	struct amdgpu_ras_block_object ras_block;
 };
 
 struct amdgpu_mca_ras {
 	struct ras_common_if *ras_if;
-	const struct amdgpu_mca_ras_funcs *ras_funcs;
+	struct amdgpu_mca_ras_block *ras;
 };
 
 struct amdgpu_mca_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6a145d0e0032..2e38bd3d3d45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -884,31 +884,6 @@ static struct amdgpu_ras_block_object* amdgpu_ras_get_ras_block(struct amdgpu_de
 	return NULL;
 }
 
-void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
-				       struct ras_common_if *ras_block,
-				       struct ras_err_data  *err_data)
-{
-	switch (ras_block->sub_block_index) {
-	case AMDGPU_RAS_MCA_BLOCK__MP0:
-		if (adev->mca.mp0.ras_funcs &&
-		    adev->mca.mp0.ras_funcs->query_ras_error_count)
-			adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_MCA_BLOCK__MP1:
-		if (adev->mca.mp1.ras_funcs &&
-		    adev->mca.mp1.ras_funcs->query_ras_error_count)
-			adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	case AMDGPU_RAS_MCA_BLOCK__MPIO:
-		if (adev->mca.mpio.ras_funcs &&
-		    adev->mca.mpio.ras_funcs->query_ras_error_count)
-			adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data);
-		break;
-	default:
-		break;
-	}
-}
-
 /* query/inject/cure begin */
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 				  struct ras_query_if *info)
@@ -956,6 +931,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 	case AMDGPU_RAS_BLOCK__PCIE_BIF:
 	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
 	case AMDGPU_RAS_BLOCK__HDP:
+	case AMDGPU_RAS_BLOCK__MCA:
 		if (!block_obj || !block_obj->ops)	{
 			dev_info(adev->dev, "%s don't config ras function \n",
 				get_ras_block_str(&info->head));
@@ -964,9 +940,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		if (block_obj->ops->query_ras_error_count)
 			block_obj->ops->query_ras_error_count(adev, &err_data);
 		break;
-	case AMDGPU_RAS_BLOCK__MCA:
-		amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
-		break;
 	default:
 		break;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
index 8f7107d392af..99edc75ed4ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
@@ -47,14 +47,34 @@ static void mca_v3_0_mp0_ras_fini(struct amdgpu_device *adev)
 	amdgpu_mca_ras_fini(adev, &adev->mca.mp0);
 }
 
-const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = {
+static int mca_v3_0_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
+{
+	if(!block_obj)
+		return -EINVAL;
+
+	if( (block_obj->block == block) &&
+		(block_obj->sub_block_index == sub_block_index)) {
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+const struct amdgpu_ras_block_ops mca_v3_0_mp0_ops = {
+	.ras_block_match = mca_v3_0_ras_block_match,
 	.ras_late_init = mca_v3_0_mp0_ras_late_init,
 	.ras_fini = mca_v3_0_mp0_ras_fini,
 	.query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
 	.query_ras_error_address = NULL,
-	.ras_block = AMDGPU_RAS_BLOCK__MCA,
-	.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP0,
-	.sysfs_name = "mp0_err_count",
+};
+
+struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = {
+	.ras_block = {
+		.ops = &mca_v3_0_mp0_ops,
+		.block = AMDGPU_RAS_BLOCK__MCA,
+		.sub_block_index = AMDGPU_RAS_MCA_BLOCK__MP0,
+		.name = "mp0_err_count",
+	},
 };
 
 static void mca_v3_0_mp1_query_ras_error_count(struct amdgpu_device *adev,
@@ -75,14 +95,21 @@ static void mca_v3_0_mp1_ras_fini(struct amdgpu_device *adev)
 	amdgpu_mca_ras_fini(adev, &adev->mca.mp1);
 }
 
-const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = {
+const struct amdgpu_ras_block_ops mca_v3_0_mp1_ops = {
+	.ras_block_match = mca_v3_0_ras_block_match,
 	.ras_late_init = mca_v3_0_mp1_ras_late_init,
 	.ras_fini = mca_v3_0_mp1_ras_fini,
 	.query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
 	.query_ras_error_address = NULL,
-	.ras_block = AMDGPU_RAS_BLOCK__MCA,
-	.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP1,
-	.sysfs_name = "mp1_err_count",
+};
+
+struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = {
+	.ras_block = {
+		.ops = &mca_v3_0_mp1_ops,
+		.block = AMDGPU_RAS_BLOCK__MCA,
+		.sub_block_index = AMDGPU_RAS_MCA_BLOCK__MP1,
+		.name = "mp1_err_count",
+	},
 };
 
 static void mca_v3_0_mpio_query_ras_error_count(struct amdgpu_device *adev,
@@ -103,14 +130,21 @@ static void mca_v3_0_mpio_ras_fini(struct amdgpu_device *adev)
 	amdgpu_mca_ras_fini(adev, &adev->mca.mpio);
 }
 
-const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = {
+const struct amdgpu_ras_block_ops mca_v3_0_mpio_ops = {
+	.ras_block_match = mca_v3_0_ras_block_match,
 	.ras_late_init = mca_v3_0_mpio_ras_late_init,
 	.ras_fini = mca_v3_0_mpio_ras_fini,
 	.query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
 	.query_ras_error_address = NULL,
-	.ras_block = AMDGPU_RAS_BLOCK__MCA,
-	.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MPIO,
-	.sysfs_name = "mpio_err_count",
+};
+
+struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = {
+	.ras_block = {
+		.ops = &mca_v3_0_mpio_ops,
+		.block = AMDGPU_RAS_BLOCK__MCA,
+		.sub_block_index = AMDGPU_RAS_MCA_BLOCK__MPIO,
+		.name = "mpio_err_count",
+	},
 };
 
 
@@ -118,9 +152,12 @@ static void mca_v3_0_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_mca *mca = &adev->mca;
 
-	mca->mp0.ras_funcs = &mca_v3_0_mp0_ras_funcs;
-	mca->mp1.ras_funcs = &mca_v3_0_mp1_ras_funcs;
-	mca->mpio.ras_funcs = &mca_v3_0_mpio_ras_funcs;
+	mca->mp0.ras = &mca_v3_0_mp0_ras;
+	mca->mp1.ras = &mca_v3_0_mp1_ras;
+	mca->mpio.ras = &mca_v3_0_mpio_ras;
+	amdgpu_ras_register_ras_block(adev, &mca->mp0.ras->ras_block);
+	amdgpu_ras_register_ras_block(adev, &mca->mp1.ras->ras_block);
+	amdgpu_ras_register_ras_block(adev, &mca->mpio.ras->ras_block);
 }
 
 const struct amdgpu_mca_funcs mca_v3_0_funcs = {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
                   ` (8 preceding siblings ...)
  2021-12-01 10:52 ` [PATCH V2 10/11] drm/amdgpu: Modify mca " yipechai
@ 2021-12-01 10:52 ` yipechai
  2021-12-06  7:33   ` Zhou1, Tao
  2021-12-06  7:36 ` [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block Zhou1, Tao
  10 siblings, 1 reply; 20+ messages in thread
From: yipechai @ 2021-12-01 10:52 UTC (permalink / raw)
  To: amd-gfx; +Cc: Tao.Zhou1, Hawking.Zhang, yipechai, yipechai

Move each block's error injection function out of amdgpu_ras.c and into the corresponding block.

Signed-off-by: yipechai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 62 +++++-------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++++++++++
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c    | 18 +++++++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c  | 16 ++++++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c  | 16 ++++++
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c  | 16 ++++++
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 16 ++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 16 ++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 16 ++++++
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c    | 16 ++++++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c    | 16 ++++++
 drivers/gpu/drm/amd/amdgpu/umc_v8_7.c    | 16 ++++++
 12 files changed, 201 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2e38bd3d3d45..87b625d305c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 	return 0;
 }
 
-/* Trigger XGMI/WAFL error */
-static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
-				 struct ta_ras_trigger_error_input *block_info)
-{
-	int ret;
-
-	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
-		dev_warn(adev->dev, "Failed to disallow df cstate");
 
-	if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
-		dev_warn(adev->dev, "Failed to disallow XGMI power down");
-
-	ret = psp_ras_trigger_error(&adev->psp, block_info);
-
-	if (amdgpu_ras_intr_triggered())
-		return ret;
-
-	if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
-		dev_warn(adev->dev, "Failed to allow XGMI power down");
-
-	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
-		dev_warn(adev->dev, "Failed to allow df cstate");
-
-	return ret;
-}
 
 /* wrapper of psp_ras_trigger_error */
 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
@@ -1076,41 +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 	if (!obj)
 		return -EINVAL;
 
+	if (!block_obj || !block_obj->ops)	{
+		dev_info(adev->dev, "%s don't config ras function \n", get_ras_block_str(&info->head));
+		return -EINVAL;
+	}
+
 	/* Calculate XGMI relative offset */
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
-		block_info.address =
-			amdgpu_xgmi_get_relative_phy_addr(adev,
-							  block_info.address);
+		block_info.address =  amdgpu_xgmi_get_relative_phy_addr(adev, block_info.address);
 	}
 
-	switch (info->head.block) {
-	case AMDGPU_RAS_BLOCK__GFX:
-		if (!block_obj || !block_obj->ops)	{
-			dev_info(adev->dev, "%s don't config ras function \n", get_ras_block_str(&info->head));
-			return -EINVAL;
-		}
-		if (block_obj->ops->ras_error_inject)
+	if (block_obj->ops->ras_error_inject) {
+		if(info->head.block == AMDGPU_RAS_BLOCK__GFX)
 			ret = block_obj->ops->ras_error_inject(adev, info);
-		break;
-	case AMDGPU_RAS_BLOCK__UMC:
-	case AMDGPU_RAS_BLOCK__SDMA:
-	case AMDGPU_RAS_BLOCK__MMHUB:
-	case AMDGPU_RAS_BLOCK__PCIE_BIF:
-	case AMDGPU_RAS_BLOCK__MCA:
-		ret = psp_ras_trigger_error(&adev->psp, &block_info);
-		break;
-	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-		ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
-		break;
-	default:
-		dev_info(adev->dev, "%s error injection is not supported yet\n",
-			 get_ras_block_str(&info->head));
-		ret = -EINVAL;
+		else
+			ret = block_obj->ops->ras_error_inject(adev, &block_info);
 	}
 
 	if (ret)
-		dev_err(adev->dev, "ras inject %s failed %d\n",
-			get_ras_block_str(&info->head), ret);
+		dev_err(adev->dev, "ras inject %s failed %d\n", get_ras_block_str(&info->head), ret);
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index da541c7b1ec2..298742afba99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -940,6 +940,33 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 	err_data->ce_count += ce_cnt;
 }
 
+/* Trigger XGMI/WAFL error */
+static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
+                                void *inject_if)
+{
+       int ret = 0;;
+       struct ta_ras_trigger_error_input *block_info =  (struct ta_ras_trigger_error_input *)inject_if;
+
+       if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
+               dev_warn(adev->dev, "Failed to disallow df cstate");
+
+       if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
+               dev_warn(adev->dev, "Failed to disallow XGMI power down");
+
+       ret = psp_ras_trigger_error(&adev->psp, block_info);
+
+       if (amdgpu_ras_intr_triggered())
+               return ret;
+
+       if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
+               dev_warn(adev->dev, "Failed to allow XGMI power down");
+
+       if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
+               dev_warn(adev->dev, "Failed to allow df cstate");
+
+       return ret;
+}
+
 static int amdgpu_xgmi_ras_block_match(struct amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)
 {
 	if(!block_obj)
@@ -958,6 +985,7 @@ struct amdgpu_ras_block_ops  xgmi_ras_ops = {
 	.ras_fini = amdgpu_xgmi_ras_fini,
 	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
 	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
+	.ras_error_inject = amdgpu_ras_error_inject_xgmi,
 };
 
 struct amdgpu_xgmi_ras xgmi_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
index 99edc75ed4ec..ce6841967b05 100644
--- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
@@ -60,12 +60,28 @@ static int mca_v3_0_ras_block_match(struct amdgpu_ras_block_object* block_obj, e
 	return -EINVAL;
 }
 
+static int mca_v3_0_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret = 0;
+	if (!adev || !inject_if) {
+		dev_err(adev->dev, "%s invaild parameters \n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return ret;
+}
+
 const struct amdgpu_ras_block_ops mca_v3_0_mp0_ops = {
 	.ras_block_match = mca_v3_0_ras_block_match,
 	.ras_late_init = mca_v3_0_mp0_ras_late_init,
 	.ras_fini = mca_v3_0_mp0_ras_fini,
 	.query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
 	.query_ras_error_address = NULL,
+	.ras_error_inject = mca_v3_0_ras_error_inject,
 };
 
 struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = {
@@ -101,6 +117,7 @@ const struct amdgpu_ras_block_ops mca_v3_0_mp1_ops = {
 	.ras_fini = mca_v3_0_mp1_ras_fini,
 	.query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
 	.query_ras_error_address = NULL,
+	.ras_error_inject = mca_v3_0_ras_error_inject,
 };
 
 struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = {
@@ -136,6 +153,7 @@ const struct amdgpu_ras_block_ops mca_v3_0_mpio_ops = {
 	.ras_fini = mca_v3_0_mpio_ras_fini,
 	.query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
 	.query_ras_error_address = NULL,
+	.ras_error_inject = mca_v3_0_ras_error_inject,
 };
 
 struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index da505314802a..7cca86c504e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -786,12 +786,28 @@ static int mmhub_v1_0_ras_block_match(struct amdgpu_ras_block_object* block_obj,
 	return -EINVAL;
 }
 
+static int mmhub_v1_0_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret = 0;
+	if (!adev || !inject_if) {
+		dev_err(adev->dev, "%s invaild parameters \n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return ret;
+}
+
 struct amdgpu_ras_block_ops mmhub_v1_0_ras_ops = {
 	.ras_block_match = mmhub_v1_0_ras_block_match,
 	.ras_late_init = amdgpu_mmhub_ras_late_init,
 	.ras_fini = amdgpu_mmhub_ras_fini,
 	.query_ras_error_count = mmhub_v1_0_query_ras_error_count,
 	.reset_ras_error_count = mmhub_v1_0_reset_ras_error_count,
+	.ras_error_inject = mmhub_v1_0_ras_error_inject,
 };
 
 struct amdgpu_mmhub_ras mmhub_v1_0_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
index 829d14ee87d3..79a9995caef1 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
@@ -1333,6 +1333,21 @@ static int mmhub_v1_7_ras_block_match(struct amdgpu_ras_block_object* block_obj,
 	return -EINVAL;
 }
 
+static int mmhub_v1_7_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret = 0;
+	if (!adev || !inject_if) {
+		dev_err(adev->dev, "%s invaild parameters \n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return ret;
+}
+
 struct amdgpu_ras_block_ops mmhub_v1_7_ras_ops = {
 	.ras_block_match = mmhub_v1_7_ras_block_match,
 	.ras_late_init = amdgpu_mmhub_ras_late_init,
@@ -1341,6 +1356,7 @@ struct amdgpu_ras_block_ops mmhub_v1_7_ras_ops = {
 	.reset_ras_error_count = mmhub_v1_7_reset_ras_error_count,
 	.query_ras_error_status = mmhub_v1_7_query_ras_error_status,
 	.reset_ras_error_status = mmhub_v1_7_reset_ras_error_status,
+	.ras_error_inject = mmhub_v1_7_ras_error_inject,
 };
 
 struct amdgpu_mmhub_ras mmhub_v1_7_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 1edc98e5bcbb..eaed556b9551 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -1667,6 +1667,21 @@ static int mmhub_v9_4_ras_block_match(struct amdgpu_ras_block_object* block_obj,
 	return -EINVAL;
 }
 
+static int mmhub_v9_4_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret = 0;
+	if (!adev || !inject_if) {
+		dev_err(adev->dev, "%s invaild parameters \n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return ret;
+}
+
 const struct amdgpu_ras_block_ops mmhub_v9_4_ras_ops = {
 	.ras_block_match = mmhub_v9_4_ras_block_match,
 	.ras_late_init = amdgpu_mmhub_ras_late_init,
@@ -1674,6 +1689,7 @@ const struct amdgpu_ras_block_ops mmhub_v9_4_ras_ops = {
 	.query_ras_error_count = mmhub_v9_4_query_ras_error_count,
 	.reset_ras_error_count = mmhub_v9_4_reset_ras_error_count,
 	.query_ras_error_status = mmhub_v9_4_query_ras_error_status,
+	.ras_error_inject = mmhub_v9_4_ras_error_inject,
 };
 
 struct amdgpu_mmhub_ras mmhub_v9_4_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 14f7265d954e..8e62e2ffabe5 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -650,11 +650,27 @@ static int nbio_v7_4_ras_block_match(struct amdgpu_ras_block_object* block_obj,
 	return -EINVAL;
 }
 
+static int nbio_v7_4_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret = 0;
+	if (!adev || !inject_if) {
+		dev_err(adev->dev, "%s invaild parameters \n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return ret;
+}
+
 const struct amdgpu_ras_block_ops nbio_v7_4_ras_ops = {
 	.ras_block_match = nbio_v7_4_ras_block_match,
 	.query_ras_error_count = nbio_v7_4_query_ras_error_count,
 	.ras_late_init = amdgpu_nbio_ras_late_init,
 	.ras_fini = amdgpu_nbio_ras_fini,
+	.ras_error_inject = nbio_v7_4_ras_error_inject,
 };
 
 struct amdgpu_nbio_ras nbio_v7_4_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 30a651613776..578ee40cc0d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -2803,11 +2803,27 @@ static int sdma_v4_0_ras_block_match(struct amdgpu_ras_block_object* block_obj,
 	return -EINVAL;
 }
 
+static int sdma_v4_0_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret = 0;
+	if (!adev || !inject_if) {
+		dev_err(adev->dev, "%s invaild parameters \n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return ret;
+}
+
 const struct amdgpu_ras_block_ops sdma_v4_0_ras_ops = {
 	.ras_block_match = sdma_v4_0_ras_block_match,
 	.ras_fini = amdgpu_sdma_ras_fini,
 	.query_ras_error_count = sdma_v4_0_query_ras_error_count,
 	.reset_ras_error_count = sdma_v4_0_reset_ras_error_count,
+	.ras_error_inject = sdma_v4_0_ras_error_inject,
 };
 
 static struct amdgpu_sdma_ras sdma_v4_0_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
index 8c165bcb0ffa..0656c6a7a2c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
@@ -270,11 +270,27 @@ static int sdma_v4_4_ras_block_match(struct amdgpu_ras_block_object* block_obj,
 	return -EINVAL;
 }
 
+static int sdma_v4_4_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret = 0;
+	if (!adev || !inject_if) {
+		dev_err(adev->dev, "%s invaild parameters \n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return ret;
+}
+
 const struct amdgpu_ras_block_ops sdma_v4_4_ras_ops = {
 	.ras_block_match = sdma_v4_4_ras_block_match,
 	.ras_fini = amdgpu_sdma_ras_fini,
 	.query_ras_error_count = sdma_v4_4_query_ras_error_count,
 	.reset_ras_error_count = sdma_v4_4_reset_ras_error_count,
+	.ras_error_inject = sdma_v4_4_ras_error_inject,
 };
 
 struct amdgpu_sdma_ras sdma_v4_4_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
index ed480c2081a6..2058439b02cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
@@ -477,12 +477,28 @@ static int umc_v6_1_ras_block_match(struct amdgpu_ras_block_object* block_obj, e
 	return -EINVAL;
 }
 
+static int umc_v6_1_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret = 0;
+	if (!adev || !inject_if) {
+		dev_err(adev->dev, "%s invaild parameters \n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return ret;
+}
+
 const struct amdgpu_ras_block_ops umc_v6_1_ras_ops = {
 	.ras_block_match = umc_v6_1_ras_block_match,
 	.ras_late_init = amdgpu_umc_ras_late_init,
 	.ras_fini = amdgpu_umc_ras_fini,
 	.query_ras_error_count = umc_v6_1_query_ras_error_count,
 	.query_ras_error_address = umc_v6_1_query_ras_error_address,
+	.ras_error_inject = umc_v6_1_ras_error_inject,
 };
 
 struct amdgpu_umc_ras umc_v6_1_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index e26728dbc6e9..2e87e7de4a55 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -333,6 +333,21 @@ static int umc_v6_7_ras_block_match(struct amdgpu_ras_block_object* block_obj, e
 	return -EINVAL;
 }
 
+/* Forward @inject_if to psp_ras_trigger_error() under grbm_idx_mutex; -EINVAL on NULL args. */
+static int umc_v6_7_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret;
+
+	if (!adev || !inject_if) {
+		pr_err("%s invalid parameters\n", __func__); /* adev may be NULL: no dev_err() */
+		return -EINVAL;
+	}
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+	return ret;
+}
+
 const struct amdgpu_ras_block_ops umc_v6_7_ras_pos = {
 	.ras_block_match = umc_v6_7_ras_block_match,
 	.ras_late_init = amdgpu_umc_ras_late_init,
@@ -340,6 +355,7 @@ const struct amdgpu_ras_block_ops umc_v6_7_ras_pos = {
 	.query_ras_error_count = umc_v6_7_query_ras_error_count,
 	.query_ras_error_address = umc_v6_7_query_ras_error_address,
 	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
+	.ras_error_inject = umc_v6_7_ras_error_inject,
 };
 
 struct amdgpu_umc_ras umc_v6_7_ras = {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
index 037791e90c24..f7fb653434b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
@@ -336,12 +336,28 @@ static int umc_v8_7_ras_block_match(struct amdgpu_ras_block_object* block_obj, e
 	return -EINVAL;
 }
 
+/* Forward @inject_if to psp_ras_trigger_error() under grbm_idx_mutex; -EINVAL on NULL args. */
+static int umc_v8_7_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+{
+	int ret;
+
+	if (!adev || !inject_if) {
+		pr_err("%s invalid parameters\n", __func__); /* adev may be NULL: no dev_err() */
+		return -EINVAL;
+	}
+	mutex_lock(&adev->grbm_idx_mutex);
+	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
+	mutex_unlock(&adev->grbm_idx_mutex);
+	return ret;
+}
+
 const struct amdgpu_ras_block_ops umc_v8_7_ras_ops = {
 	.ras_block_match = umc_v8_7_ras_block_match,
 	.ras_late_init = amdgpu_umc_ras_late_init,
 	.ras_fini = amdgpu_umc_ras_fini,
 	.query_ras_error_count = umc_v8_7_query_ras_error_count,
 	.query_ras_error_address = umc_v8_7_query_ras_error_address,
+	.ras_error_inject = umc_v8_7_ras_error_inject,
 };
 
 struct amdgpu_umc_ras umc_v8_7_ras = {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h
  2021-12-01 10:52 ` [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h yipechai
@ 2021-12-06  6:56   ` Zhou1, Tao
  2021-12-07  2:31     ` Chai, Thomas
  0 siblings, 1 reply; 20+ messages in thread
From: Zhou1, Tao @ 2021-12-06  6:56 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx; +Cc: Zhang, Hawking

[AMD Official Use Only]



> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai,
> Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem
> when other ras blocks' .h include amdgpu_ras.h
> 
> Modify the compilation failed problem when other ras blocks' .h include

[Tao] 'Fix' is better than "Modify" here.

> amdgpu_ras.h.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 23 ++++-------------------
>  2 files changed, 26 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 8713575c7cf1..1cf1f6331db1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2739,6 +2739,28 @@ static void
> amdgpu_register_bad_pages_mca_notifier(void)
>          }
>  }
>  #endif
> +
> +/* check if ras is supported on block, say, sdma, gfx */ int
> +amdgpu_ras_is_supported(struct amdgpu_device *adev,
> +		unsigned int block)
> +{
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	if (block >= AMDGPU_RAS_BLOCK_COUNT)
> +		return 0;
> +	return ras && (adev->ras_enabled & (1 << block)); }
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) {
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> +		schedule_work(&ras->recovery_work);
> +	return 0;
> +}
> +
> +
>  /* Rigister each ip ras block into amdgpu ras */  int
> amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
>  		struct amdgpu_ras_block_object* ras_block_obj) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index d6e5e3c862bd..41623a649fa1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -514,16 +514,6 @@ struct amdgpu_ras_block_ops {
>  #define amdgpu_ras_get_context(adev)		((adev)->psp.ras_context.ras)
>  #define amdgpu_ras_set_context(adev, ras_con)	((adev)-
> >psp.ras_context.ras = (ras_con))
> 
> -/* check if ras is supported on block, say, sdma, gfx */ -static inline int
> amdgpu_ras_is_supported(struct amdgpu_device *adev,
> -		unsigned int block)
> -{
> -	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> -	if (block >= AMDGPU_RAS_BLOCK_COUNT)
> -		return 0;
> -	return ras && (adev->ras_enabled & (1 << block));
> -}
> 
>  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
> 
> @@ -540,15 +530,6 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
> 
>  int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
> 
> -static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) -{
> -	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> -	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> -		schedule_work(&ras->recovery_work);
> -	return 0;
> -}
> -
>  static inline enum ta_ras_block
>  amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
>  	switch (block) {
> @@ -680,5 +661,9 @@ const char *get_ras_block_str(struct ras_common_if
> *ras_block);
> 
>  bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
> 
> +int amdgpu_ras_is_supported(struct amdgpu_device *adev,	unsigned int
> block);
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
> +
>  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct
> amdgpu_ras_block_object* ras_block_obj);  #endif
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops
  2021-12-01 10:52 ` [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops yipechai
@ 2021-12-06  6:58   ` Zhou1, Tao
  2021-12-07  3:37     ` Chai, Thomas
  0 siblings, 1 reply; 20+ messages in thread
From: Zhou1, Tao @ 2021-12-06  6:58 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx; +Cc: Zhang, Hawking

[AMD Official Use Only]

Please see my comments inline.

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai,
> Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified
> ras block data and ops
> 
> 1.Modify gfx block to fit for the unified ras block data and ops
> 2.Implement .ras_block_match function pointer for gfx block to identify itself.
> 3.Change amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and the corresponding
> variable name remove _funcs suffix.
> 4.Remove the const flag of gfx ras variable so that gfx ras block can be able to
> be insertted into amdgpu device ras block link list.
> 5.Invoke amdgpu_ras_register_ras_block function to register gfx ras block into
> amdgpu device ras block link list.
> 6.Remove the redundant code about gfx in amdgpu_ras.c after using the unified
> ras block.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  6 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 15 ++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 80 ++++++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 73 +++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 39 ++++++++----
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 42 +++++++++----
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
>  8 files changed, 178 insertions(+), 81 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 1795d448c700..da8691259ac1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct
> amdgpu_device *adev,
>  	 */
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>  		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_count)
> -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> err_data);
> +		if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +		    adev->gfx.ras->ras_block.ops->query_ras_error_count)
> +			adev->gfx.ras->ras_block.ops-
> >query_ras_error_count(adev, err_data);
>  		amdgpu_ras_reset_gpu(adev);
>  	}
>  	return AMDGPU_RAS_SUCCESS;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 6b78b4a0e182..ff4a8428a84b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -31,6 +31,7 @@
>  #include "amdgpu_ring.h"
>  #include "amdgpu_rlc.h"
>  #include "soc15.h"
> +#include "amdgpu_ras.h"
> 
>  /* GFX current status */
>  #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
> @@ -213,16 +214,8 @@ struct amdgpu_cu_info {
>  	uint32_t bitmap[4][4];
>  };
> 
> -struct amdgpu_gfx_ras_funcs {
> -	int (*ras_late_init)(struct amdgpu_device *adev);
> -	void (*ras_fini)(struct amdgpu_device *adev);
> -	int (*ras_error_inject)(struct amdgpu_device *adev,
> -				void *inject_if);
> -	int (*query_ras_error_count)(struct amdgpu_device *adev,
> -				     void *ras_error_status);
> -	void (*reset_ras_error_count)(struct amdgpu_device *adev);
> -	void (*query_ras_error_status)(struct amdgpu_device *adev);
> -	void (*reset_ras_error_status)(struct amdgpu_device *adev);
> +struct amdgpu_gfx_ras {
> +	struct amdgpu_ras_block_object  ras_block;
>  	void (*enable_watchdog_timer)(struct amdgpu_device *adev);  };

[Tao] Can we add the "enable_watchdog_timer" function to the amdgpu_ras_block_ops structure?
And I think using ras_block directly is simpler than the amdgpu_gfx_ras gfx_v9_0_ras structure.

> 
> @@ -348,7 +341,7 @@ struct amdgpu_gfx {
> 
>  	/*ras */
>  	struct ras_common_if			*ras_if;
> -	const struct amdgpu_gfx_ras_funcs	*ras_funcs;
> +	struct amdgpu_gfx_ras	*ras;
>  };
> 
>  #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs-
> >get_gpu_clock_counter((adev))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1cf1f6331db1..190a4a4e9d7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -862,6 +862,27 @@ static int amdgpu_ras_enable_all_features(struct
> amdgpu_device *adev,  }
>  /* feature ctl end */
> 
> +static struct amdgpu_ras_block_object* amdgpu_ras_get_ras_block(struct
> amdgpu_device *adev,
> +					enum amdgpu_ras_block block,
> uint32_t sub_block_index) {
> +	struct amdgpu_ras_block_object *obj, *tmp;
> +
> +	if (block >= AMDGPU_RAS_BLOCK__LAST) {
> +		return NULL;
> +	}
[Tao] The "{}" can be dropped since there is only one line under the if.

> +
> +	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> +		if( !obj->ops || !obj->ops->ras_block_match) {
[Tao] A space is needed after "if", and the space before "!obj" can be removed.

> +			dev_info(adev->dev, "%s don't config ops or
> ras_block_match\n", obj->name);
> +			continue;
> +		}
> +		if (!obj->ops->ras_block_match(obj, block, sub_block_index)) {
> +			return obj;
> +		}
[Tao] The "{}" can be removed.

> +	}
> +
> +	return NULL;
> +}
[Tao] This is a generic ras function, not gfx specific, the code can be moved to patch #1.

> 
>  void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
>  				       struct ras_common_if *ras_block, @@ -
> 892,6 +913,7 @@ void amdgpu_ras_mca_query_error_status(struct
> amdgpu_device *adev,  int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  				  struct ras_query_if *info)
>  {
> +	struct amdgpu_ras_block_object* block_obj = NULL;
>  	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>  	struct ras_err_data err_data = {0, 0, 0, NULL};
>  	int i;
> @@ -899,6 +921,8 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  	if (!obj)
>  		return -EINVAL;
> 
> +	block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
> +
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__UMC:
>  		if (adev->umc.ras_funcs &&
> @@ -919,13 +943,17 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  		}
>  		break;
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_count)
> -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> &err_data);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> +				get_ras_block_str(&info->head));
> +			return -EINVAL;
> +		}
[Tao] Can we put the check right after "block_obj = amdgpu_ras_get_ras_block"? The same suggestion applies to all similar code.

> +
> +		if (block_obj->ops->query_ras_error_count)
> +			block_obj->ops->query_ras_error_count(adev,
> &err_data);
> 
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_status)
> -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> +		if (block_obj->ops->query_ras_error_status)
> +			block_obj->ops->query_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> @@ -1012,18 +1040,21 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,  int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>  		enum amdgpu_ras_block block)
>  {
> +	struct amdgpu_ras_block_object* block_obj =
> +amdgpu_ras_get_ras_block(adev, block, 0);
>  	if (!amdgpu_ras_is_supported(adev, block))
>  		return -EINVAL;
> 
>  	switch (block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->reset_ras_error_count)
> -			adev->gfx.ras_funcs->reset_ras_error_count(adev);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> ras_block_str(block));
> +			return -EINVAL;
> +		}
> +		if (block_obj->ops->reset_ras_error_count)
> +			block_obj->ops->reset_ras_error_count(adev);
> 
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->reset_ras_error_status)
> -			adev->gfx.ras_funcs->reset_ras_error_status(adev);
> +		if (block_obj->ops->reset_ras_error_status)
> +			block_obj->ops->reset_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> @@ -1088,7 +1119,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device
> *adev,
>  		.address = info->address,
>  		.value = info->value,
>  	};
> -	int ret = 0;
> +	int ret = -EINVAL;
> +	struct amdgpu_ras_block_object* block_obj =
> +amdgpu_ras_get_ras_block(adev, info->head.block,
> +info->head.sub_block_index);
> 
>  	if (!obj)
>  		return -EINVAL;
> @@ -1102,11 +1134,12 @@ int amdgpu_ras_error_inject(struct amdgpu_device
> *adev,
> 
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->ras_error_inject)
> -			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
> -		else
> -			ret = -EINVAL;
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> +			return -EINVAL;
> +		}
> +		if (block_obj->ops->ras_error_inject)
> +			ret = block_obj->ops->ras_error_inject(adev, info);
>  		break;
>  	case AMDGPU_RAS_BLOCK__UMC:
>  	case AMDGPU_RAS_BLOCK__SDMA:
> @@ -1727,15 +1760,20 @@ static void amdgpu_ras_log_on_err_counter(struct
> amdgpu_device *adev)  static void amdgpu_ras_error_status_query(struct
> amdgpu_device *adev,
>  					  struct ras_query_if *info)
>  {
> +	struct amdgpu_ras_block_object* block_obj =
> +amdgpu_ras_get_ras_block(adev, info->head.block,
> +info->head.sub_block_index);
>  	/*
>  	 * Only two block need to query read/write
>  	 * RspStatus at current state
>  	 */
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_status)
> -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> +			return ;
> +		}
> +
> +		if (block_obj->ops->query_ras_error_status)
> +			block_obj->ops->query_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 08e91e7245df..2ffde223c4f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -817,7 +817,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device
> *adev,  static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device
> *adev);  static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
> static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); -static
> int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
>  					  void *ras_error_status);
>  static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
>  				     void *inject_if);
> @@ -2118,6 +2118,18 @@ static void gfx_v9_0_select_me_pipe_q(struct
> amdgpu_device *adev,
>  	soc15_grbm_select(adev, me, pipe, q, vm);  }
> 
> +static int gfx_v9_0_ras_block_match(struct amdgpu_ras_block_object*
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +
> +	return -EINVAL;
[Tao] The return type can be changed to bool, with the function returning true or false instead of -EINVAL and 0.

> +}
[Tao] It's better to implement a general ras block match function in amdgpu_ras.c

> +
>  static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
>          .get_gpu_clock_counter = &gfx_v9_0_get_gpu_clock_counter,
>          .select_se_sh = &gfx_v9_0_select_se_sh, @@ -2127,12 +2139,21 @@
> static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
>          .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,  };
> 
> -static const struct amdgpu_gfx_ras_funcs gfx_v9_0_ras_funcs = {
> -	.ras_late_init = amdgpu_gfx_ras_late_init,
> -	.ras_fini = amdgpu_gfx_ras_fini,
> -	.ras_error_inject = &gfx_v9_0_ras_error_inject,
> -	.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> -	.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
> +const struct amdgpu_ras_block_ops  gfx_v9_0_ras_ops = {

[Tao]  static const?

> +		.ras_block_match = gfx_v9_0_ras_block_match,
> +		.ras_late_init = amdgpu_gfx_ras_late_init,
> +		.ras_fini = amdgpu_gfx_ras_fini,
> +		.ras_error_inject = &gfx_v9_0_ras_error_inject,
> +		.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> +		.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count, };
> +
> +static struct amdgpu_gfx_ras gfx_v9_0_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_0_ras_ops,
> +	},
>  };
> 
>  static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) @@ -2161,7
> +2182,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
>  		DRM_INFO("fix gfx.config for vega12\n");
>  		break;
>  	case CHIP_VEGA20:
> -		adev->gfx.ras_funcs = &gfx_v9_0_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_0_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2187,7 +2208,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device
> *adev)
>  			gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
>  		break;
>  	case CHIP_ARCTURUS:
> -		adev->gfx.ras_funcs = &gfx_v9_4_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_4_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2208,7 +2229,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device
> *adev)
>  		gb_addr_config |= 0x22010042;
>  		break;
>  	case CHIP_ALDEBARAN:
> -		adev->gfx.ras_funcs = &gfx_v9_4_2_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_4_2_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2227,6 +2248,14 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device
> *adev)
>  		break;
>  	}
> 
> +	if (adev->gfx.ras) {
> +		err = amdgpu_ras_register_ras_block(adev, &adev->gfx.ras-
> >ras_block);
> +		if (err) {
> +			DRM_ERROR("Failed to register gfx ras block!\n");
> +			return err;
> +		}
> +	}
> +
>  	adev->gfx.config.gb_addr_config = gb_addr_config;
> 
>  	adev->gfx.config.gb_addr_config_fields.num_pipes = 1 << @@ -2448,9
> +2477,9 @@ static int gfx_v9_0_sw_fini(void *handle)
>  	int i;
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->ras_fini)
> -		adev->gfx.ras_funcs->ras_fini(adev);
> +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +	    adev->gfx.ras->ras_block.ops->ras_fini)
> +		adev->gfx.ras->ras_block.ops->ras_fini(adev);
> 
>  	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>  		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
> @@ -4888,16 +4917,16 @@ static int gfx_v9_0_ecc_late_init(void *handle)
>  	if (r)
>  		return r;
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->ras_late_init) {
> -		r = adev->gfx.ras_funcs->ras_late_init(adev);
> +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +	    adev->gfx.ras->ras_block.ops->ras_late_init) {
> +		r = adev->gfx.ras->ras_block.ops->ras_late_init(adev);
>  		if (r)
>  			return r;
>  	}
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->enable_watchdog_timer)
> -		adev->gfx.ras_funcs->enable_watchdog_timer(adev);
> +	if (adev->gfx.ras &&
> +	    adev->gfx.ras->enable_watchdog_timer)
> +		adev->gfx.ras->enable_watchdog_timer(adev);
> 
>  	return 0;
>  }
> @@ -6841,7 +6870,7 @@ static void gfx_v9_0_reset_ras_error_count(struct
> amdgpu_device *adev)
>  	WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);  }
> 
> -static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
>  					  void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> @@ -6850,7 +6879,7 @@ static int gfx_v9_0_query_ras_error_count(struct
> amdgpu_device *adev,
>  	uint32_t reg_value;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -6879,8 +6908,6 @@ static int gfx_v9_0_query_ras_error_count(struct
> amdgpu_device *adev,
>  	mutex_unlock(&adev->grbm_idx_mutex);
> 
>  	gfx_v9_0_query_utc_edc_status(adev, err_data);
> -
> -	return 0;
>  }
> 
>  static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring) diff --git
> a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> index b4789dfc2bb9..2d816addbd4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> @@ -863,7 +863,7 @@ static int gfx_v9_4_ras_error_count(struct
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> -static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
>  					  void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> @@ -872,7 +872,7 @@ static int gfx_v9_4_query_ras_error_count(struct
> amdgpu_device *adev,
>  	uint32_t reg_value;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -903,7 +903,6 @@ static int gfx_v9_4_query_ras_error_count(struct
> amdgpu_device *adev,
> 
>  	gfx_v9_4_query_utc_edc_status(adev, err_data);
> 
> -	return 0;
>  }
> 
>  static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev) @@
> -1029,11 +1028,31 @@ static void gfx_v9_4_query_ras_error_status(struct
> amdgpu_device *adev)
>  	mutex_unlock(&adev->grbm_idx_mutex);
>  }
> 
> -const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs = {
> -        .ras_late_init = amdgpu_gfx_ras_late_init,
> -        .ras_fini = amdgpu_gfx_ras_fini,
> -        .ras_error_inject = &gfx_v9_4_ras_error_inject,
> -        .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> -        .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> -        .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
> +static int gfx_v9_4_ras_block_match(struct amdgpu_ras_block_object*
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +	return -EINVAL;
> +}
> +
> +const struct amdgpu_ras_block_ops  gfx_v9_4_ras_ops = {
> +	.ras_block_match = gfx_v9_4_ras_block_match,
> +	.ras_late_init = amdgpu_gfx_ras_late_init,
> +	.ras_fini = amdgpu_gfx_ras_fini,
> +	.ras_error_inject = &gfx_v9_4_ras_error_inject,
> +	.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> +	.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> +	.query_ras_error_status = &gfx_v9_4_query_ras_error_status, };
> +
> +struct amdgpu_gfx_ras gfx_v9_4_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_4_ras_ops,
> +	},
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> index bdd16b568021..ca520a767267 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> @@ -24,6 +24,6 @@
>  #ifndef __GFX_V9_4_H__
>  #define __GFX_V9_4_H__
> 
> -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs;
> +extern struct amdgpu_gfx_ras gfx_v9_4_ras;
> 
>  #endif /* __GFX_V9_4_H__ */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> index 54306fd45ff1..2744709fa09d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> @@ -1644,14 +1644,14 @@ static int gfx_v9_4_2_query_utc_edc_count(struct
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> -static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_4_2_query_ras_error_count(struct amdgpu_device
> +*adev,
>  					    void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
>  	uint32_t sec_count = 0, ded_count = 0;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -1664,7 +1664,6 @@ static int gfx_v9_4_2_query_ras_error_count(struct
> amdgpu_device *adev,
>  	err_data->ce_count += sec_count;
>  	err_data->ue_count += ded_count;
> 
> -	return 0;
>  }
> 
>  static void gfx_v9_4_2_reset_utc_err_status(struct amdgpu_device *adev) @@
> -1934,13 +1933,34 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct
> amdgpu_device *adev)
>  	mutex_unlock(&adev->grbm_idx_mutex);
>  }
> 
> -const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs = {
> -	.ras_late_init = amdgpu_gfx_ras_late_init,
> -	.ras_fini = amdgpu_gfx_ras_fini,
> -	.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> -	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> -	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> -	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
> -	.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> +static int gfx_v9_4_2_ras_block_match(struct amdgpu_ras_block_object*
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +struct amdgpu_ras_block_ops  gfx_v9_4_2_ras_ops ={
> +		.ras_block_match = gfx_v9_4_2_ras_block_match,
> +		.ras_late_init = amdgpu_gfx_ras_late_init,
> +		.ras_fini = amdgpu_gfx_ras_fini,
> +		.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> +		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> +		.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> +		.query_ras_error_status =
> &gfx_v9_4_2_query_ras_error_status,
> +		.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> +};
> +
> +struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_4_2_ras_ops,
> +	},
>  	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> index 6db1f88509af..7584624b641c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> @@ -31,6 +31,6 @@ void gfx_v9_4_2_init_golden_registers(struct
> amdgpu_device *adev,  void gfx_v9_4_2_set_power_brake_sequence(struct
> amdgpu_device *adev);  int gfx_v9_4_2_do_edc_gpr_workarounds(struct
> amdgpu_device *adev);
> 
> -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs;
> +extern struct amdgpu_gfx_ras gfx_v9_4_2_ras;
> 
>  #endif /* __GFX_V9_4_2_H__ */
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block
  2021-12-01 10:52 ` [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block yipechai
@ 2021-12-06  7:33   ` Zhou1, Tao
  2021-12-07  7:18     ` Chai, Thomas
  0 siblings, 1 reply; 20+ messages in thread
From: Zhou1, Tao @ 2021-12-06  7:33 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx; +Cc: Zhang, Hawking

[AMD Official Use Only]

Error injection is identical for all RAS blocks except GFX and XGMI.
I agree with moving the XGMI error injection to amdgpu_xgmi.c, but I don't think it's necessary to implement block-specific error injection functions for all the other RAS blocks.

Regards,
Tao

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai,
> Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 11/11] drm/amdgpu: Move error inject function from
> amdgpu_ras.c to each block
> 
> Move each block error inject function from amdgpu_ras.c to each block.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 62 +++++-------------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++++++++++
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.c    | 18 +++++++
>  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c  | 16 ++++++
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c  | 16 ++++++
> drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c  | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_1.c    | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c    | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/umc_v8_7.c    | 16 ++++++
>  12 files changed, 201 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2e38bd3d3d45..87b625d305c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> -/* Trigger XGMI/WAFL error */
> -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
> -				 struct ta_ras_trigger_error_input *block_info)
> -{
> -	int ret;
> -
> -	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> -		dev_warn(adev->dev, "Failed to disallow df cstate");
> 
> -	if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
> -		dev_warn(adev->dev, "Failed to disallow XGMI power down");
> -
> -	ret = psp_ras_trigger_error(&adev->psp, block_info);
> -
> -	if (amdgpu_ras_intr_triggered())
> -		return ret;
> -
> -	if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
> -		dev_warn(adev->dev, "Failed to allow XGMI power down");
> -
> -	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
> -		dev_warn(adev->dev, "Failed to allow df cstate");
> -
> -	return ret;
> -}
> 
>  /* wrapper of psp_ras_trigger_error */
>  int amdgpu_ras_error_inject(struct amdgpu_device *adev, @@ -1076,41
> +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>  	if (!obj)
>  		return -EINVAL;
> 
> +	if (!block_obj || !block_obj->ops)	{
> +		dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> +		return -EINVAL;
> +	}
> +
>  	/* Calculate XGMI relative offset */
>  	if (adev->gmc.xgmi.num_physical_nodes > 1) {
> -		block_info.address =
> -			amdgpu_xgmi_get_relative_phy_addr(adev,
> -							  block_info.address);
> +		block_info.address =
> amdgpu_xgmi_get_relative_phy_addr(adev,
> +block_info.address);
>  	}
> 
> -	switch (info->head.block) {
> -	case AMDGPU_RAS_BLOCK__GFX:
> -		if (!block_obj || !block_obj->ops)	{
> -			dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> -			return -EINVAL;
> -		}
> -		if (block_obj->ops->ras_error_inject)
> +	if (block_obj->ops->ras_error_inject) {
> +		if(info->head.block == AMDGPU_RAS_BLOCK__GFX)
>  			ret = block_obj->ops->ras_error_inject(adev, info);
> -		break;
> -	case AMDGPU_RAS_BLOCK__UMC:
> -	case AMDGPU_RAS_BLOCK__SDMA:
> -	case AMDGPU_RAS_BLOCK__MMHUB:
> -	case AMDGPU_RAS_BLOCK__PCIE_BIF:
> -	case AMDGPU_RAS_BLOCK__MCA:
> -		ret = psp_ras_trigger_error(&adev->psp, &block_info);
> -		break;
> -	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
> -		ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
> -		break;
> -	default:
> -		dev_info(adev->dev, "%s error injection is not supported yet\n",
> -			 get_ras_block_str(&info->head));
> -		ret = -EINVAL;
> +		else
> +			ret = block_obj->ops->ras_error_inject(adev,
> &block_info);
>  	}
> 
>  	if (ret)
> -		dev_err(adev->dev, "ras inject %s failed %d\n",
> -			get_ras_block_str(&info->head), ret);
> +		dev_err(adev->dev, "ras inject %s failed %d\n",
> +get_ras_block_str(&info->head), ret);
> 
>  	return ret;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index da541c7b1ec2..298742afba99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -940,6 +940,33 @@ static void
> amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
>  	err_data->ce_count += ce_cnt;
>  }
> 
> +/* Trigger XGMI/WAFL error */
> +static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
> +                                void *inject_if) {
> +       int ret = 0;;
> +       struct ta_ras_trigger_error_input *block_info =  (struct
> +ta_ras_trigger_error_input *)inject_if;
> +
> +       if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> +               dev_warn(adev->dev, "Failed to disallow df cstate");
> +
> +       if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
> +               dev_warn(adev->dev, "Failed to disallow XGMI power
> + down");
> +
> +       ret = psp_ras_trigger_error(&adev->psp, block_info);
> +
> +       if (amdgpu_ras_intr_triggered())
> +               return ret;
> +
> +       if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
> +               dev_warn(adev->dev, "Failed to allow XGMI power down");
> +
> +       if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
> +               dev_warn(adev->dev, "Failed to allow df cstate");
> +
> +       return ret;
> +}
> +
>  static int amdgpu_xgmi_ras_block_match(struct amdgpu_ras_block_object*
> block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)  {
>  	if(!block_obj)
> @@ -958,6 +985,7 @@ struct amdgpu_ras_block_ops  xgmi_ras_ops = {
>  	.ras_fini = amdgpu_xgmi_ras_fini,
>  	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
>  	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
> +	.ras_error_inject = amdgpu_ras_error_inject_xgmi,
>  };
> 
>  struct amdgpu_xgmi_ras xgmi_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> index 99edc75ed4ec..ce6841967b05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> @@ -60,12 +60,28 @@ static int mca_v3_0_ras_block_match(struct
> amdgpu_ras_block_object* block_obj, e
>  	return -EINVAL;
>  }
> 
> +static int mca_v3_0_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops mca_v3_0_mp0_ops = {
>  	.ras_block_match = mca_v3_0_ras_block_match,
>  	.ras_late_init = mca_v3_0_mp0_ras_late_init,
>  	.ras_fini = mca_v3_0_mp0_ras_fini,
>  	.query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
>  	.query_ras_error_address = NULL,
> +	.ras_error_inject = mca_v3_0_ras_error_inject,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = { @@ -101,6 +117,7 @@
> const struct amdgpu_ras_block_ops mca_v3_0_mp1_ops = {
>  	.ras_fini = mca_v3_0_mp1_ras_fini,
>  	.query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
>  	.query_ras_error_address = NULL,
> +	.ras_error_inject = mca_v3_0_ras_error_inject,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = { @@ -136,6 +153,7 @@
> const struct amdgpu_ras_block_ops mca_v3_0_mpio_ops = {
>  	.ras_fini = mca_v3_0_mpio_ras_fini,
>  	.query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
>  	.query_ras_error_address = NULL,
> +	.ras_error_inject = mca_v3_0_ras_error_inject,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> index da505314802a..7cca86c504e6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> @@ -786,12 +786,28 @@ static int mmhub_v1_0_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int mmhub_v1_0_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  struct amdgpu_ras_block_ops mmhub_v1_0_ras_ops = {
>  	.ras_block_match = mmhub_v1_0_ras_block_match,
>  	.ras_late_init = amdgpu_mmhub_ras_late_init,
>  	.ras_fini = amdgpu_mmhub_ras_fini,
>  	.query_ras_error_count = mmhub_v1_0_query_ras_error_count,
>  	.reset_ras_error_count = mmhub_v1_0_reset_ras_error_count,
> +	.ras_error_inject = mmhub_v1_0_ras_error_inject,
>  };
> 
>  struct amdgpu_mmhub_ras mmhub_v1_0_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> index 829d14ee87d3..79a9995caef1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> @@ -1333,6 +1333,21 @@ static int mmhub_v1_7_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int mmhub_v1_7_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  struct amdgpu_ras_block_ops mmhub_v1_7_ras_ops = {
>  	.ras_block_match = mmhub_v1_7_ras_block_match,
>  	.ras_late_init = amdgpu_mmhub_ras_late_init, @@ -1341,6 +1356,7
> @@ struct amdgpu_ras_block_ops mmhub_v1_7_ras_ops = {
>  	.reset_ras_error_count = mmhub_v1_7_reset_ras_error_count,
>  	.query_ras_error_status = mmhub_v1_7_query_ras_error_status,
>  	.reset_ras_error_status = mmhub_v1_7_reset_ras_error_status,
> +	.ras_error_inject = mmhub_v1_7_ras_error_inject,
>  };
> 
>  struct amdgpu_mmhub_ras mmhub_v1_7_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
> index 1edc98e5bcbb..eaed556b9551 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
> @@ -1667,6 +1667,21 @@ static int mmhub_v9_4_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int mmhub_v9_4_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops mmhub_v9_4_ras_ops = {
>  	.ras_block_match = mmhub_v9_4_ras_block_match,
>  	.ras_late_init = amdgpu_mmhub_ras_late_init, @@ -1674,6 +1689,7
> @@ const struct amdgpu_ras_block_ops mmhub_v9_4_ras_ops = {
>  	.query_ras_error_count = mmhub_v9_4_query_ras_error_count,
>  	.reset_ras_error_count = mmhub_v9_4_reset_ras_error_count,
>  	.query_ras_error_status = mmhub_v9_4_query_ras_error_status,
> +	.ras_error_inject = mmhub_v9_4_ras_error_inject,
>  };
> 
>  struct amdgpu_mmhub_ras mmhub_v9_4_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index 14f7265d954e..8e62e2ffabe5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -650,11 +650,27 @@ static int nbio_v7_4_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int nbio_v7_4_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops nbio_v7_4_ras_ops = {
>  	.ras_block_match = nbio_v7_4_ras_block_match,
>  	.query_ras_error_count = nbio_v7_4_query_ras_error_count,
>  	.ras_late_init = amdgpu_nbio_ras_late_init,
>  	.ras_fini = amdgpu_nbio_ras_fini,
> +	.ras_error_inject = nbio_v7_4_ras_error_inject,
>  };
> 
>  struct amdgpu_nbio_ras nbio_v7_4_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 30a651613776..578ee40cc0d1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -2803,11 +2803,27 @@ static int sdma_v4_0_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int sdma_v4_0_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops sdma_v4_0_ras_ops = {
>  	.ras_block_match = sdma_v4_0_ras_block_match,
>  	.ras_fini = amdgpu_sdma_ras_fini,
>  	.query_ras_error_count = sdma_v4_0_query_ras_error_count,
>  	.reset_ras_error_count = sdma_v4_0_reset_ras_error_count,
> +	.ras_error_inject = sdma_v4_0_ras_error_inject,
>  };
> 
>  static struct amdgpu_sdma_ras sdma_v4_0_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
> index 8c165bcb0ffa..0656c6a7a2c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
> @@ -270,11 +270,27 @@ static int sdma_v4_4_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int sdma_v4_4_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops sdma_v4_4_ras_ops = {
>  	.ras_block_match = sdma_v4_4_ras_block_match,
>  	.ras_fini = amdgpu_sdma_ras_fini,
>  	.query_ras_error_count = sdma_v4_4_query_ras_error_count,
>  	.reset_ras_error_count = sdma_v4_4_reset_ras_error_count,
> +	.ras_error_inject = sdma_v4_4_ras_error_inject,
>  };
> 
>  struct amdgpu_sdma_ras sdma_v4_4_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
> index ed480c2081a6..2058439b02cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
> @@ -477,12 +477,28 @@ static int umc_v6_1_ras_block_match(struct
> amdgpu_ras_block_object* block_obj, e
>  	return -EINVAL;
>  }
> 
> +static int umc_v6_1_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops umc_v6_1_ras_ops = {
>  	.ras_block_match = umc_v6_1_ras_block_match,
>  	.ras_late_init = amdgpu_umc_ras_late_init,
>  	.ras_fini = amdgpu_umc_ras_fini,
>  	.query_ras_error_count = umc_v6_1_query_ras_error_count,
>  	.query_ras_error_address = umc_v6_1_query_ras_error_address,
> +	.ras_error_inject = umc_v6_1_ras_error_inject,
>  };
> 
>  struct amdgpu_umc_ras umc_v6_1_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index e26728dbc6e9..2e87e7de4a55 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -333,6 +333,21 @@ static int umc_v6_7_ras_block_match(struct
> amdgpu_ras_block_object* block_obj, e
>  	return -EINVAL;
>  }
> 
> +static int umc_v6_7_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops umc_v6_7_ras_pos = {
>  	.ras_block_match = umc_v6_7_ras_block_match,
>  	.ras_late_init = amdgpu_umc_ras_late_init, @@ -340,6 +355,7 @@
> const struct amdgpu_ras_block_ops umc_v6_7_ras_pos = {
>  	.query_ras_error_count = umc_v6_7_query_ras_error_count,
>  	.query_ras_error_address = umc_v6_7_query_ras_error_address,
>  	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
> +	.ras_error_inject = umc_v6_7_ras_error_inject,
>  };
> 
>  struct amdgpu_umc_ras umc_v6_7_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
> index 037791e90c24..f7fb653434b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
> @@ -336,12 +336,28 @@ static int umc_v8_7_ras_block_match(struct
> amdgpu_ras_block_object* block_obj, e
>  	return -EINVAL;
>  }
> 
> +static int umc_v8_7_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops umc_v8_7_ras_ops = {
>  	.ras_block_match = umc_v8_7_ras_block_match,
>  	.ras_late_init = amdgpu_umc_ras_late_init,
>  	.ras_fini = amdgpu_umc_ras_fini,
>  	.query_ras_error_count = umc_v8_7_query_ras_error_count,
>  	.query_ras_error_address = umc_v8_7_query_ras_error_address,
> +	.ras_error_inject = umc_v8_7_ras_error_inject,
>  };
> 
>  struct amdgpu_umc_ras umc_v8_7_ras = {
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block
  2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
                   ` (9 preceding siblings ...)
  2021-12-01 10:52 ` [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block yipechai
@ 2021-12-06  7:36 ` Zhou1, Tao
  10 siblings, 0 replies; 20+ messages in thread
From: Zhou1, Tao @ 2021-12-06  7:36 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx, Clements, John; +Cc: Zhang, Hawking

[AMD Official Use Only]

It's better to loop in @Clements, John for the code review.

Regards,
Tao

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai,
> Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras
> block
> 
> 1. Define unified ops interface for each block.
> 2. Add ras_block_match function pointer in ops interface for each ras block to
> identify itself.
> 3. Define unified basic ras block data for each ras block.
> 4. Create dedicated amdgpu device ras block link list to manage all of the ras
> blocks.
> 5. Add amdgpu_ras_register_ras_block new function interface for each ras block
> to register itself to ras controlling block.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 12 +++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 29 ++++++++++++++++++++++
>  4 files changed, 45 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index db1505455761..eddf230856e2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1151,6 +1151,8 @@ struct amdgpu_device {
>  	bool				barrier_has_auto_waitcnt;
> 
>  	struct amdgpu_reset_control     *reset_cntl;
> +
> +	struct list_head		ras_list;
>  };
> 
>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 73ec46140d68..0980396ee709 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3578,6 +3578,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> 
>  	INIT_LIST_HEAD(&adev->reset_list);
> 
> +	INIT_LIST_HEAD(&adev->ras_list);
> +
>  	INIT_DELAYED_WORK(&adev->delayed_init_work,
>  			  amdgpu_device_delayed_init_work_handler);
>  	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 90f0db3b4f65..8713575c7cf1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2739,3 +2739,15 @@ static void
> amdgpu_register_bad_pages_mca_notifier(void)
>          }
>  }
>  #endif
> +/* Rigister each ip ras block into amdgpu ras */ int
> +amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
> +		struct amdgpu_ras_block_object* ras_block_obj) {
> +	if (!adev || !ras_block_obj)
> +		return -EINVAL;
> +
> +	INIT_LIST_HEAD(&ras_block_obj->node);
> +	list_add_tail(&ras_block_obj->node, &adev->ras_list);
> +
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index cdd0010a5389..d6e5e3c862bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -469,6 +469,34 @@ struct ras_debug_if {
>  	};
>  	int op;
>  };
> +
> +struct amdgpu_ras_block_object {
> +	/* block name */
> +	char name[32];
> +
> +	enum amdgpu_ras_block block;
> +
> +	uint32_t sub_block_index;
> +
> +	/* ras block link */
> +	struct list_head node;
> +
> +	const struct amdgpu_ras_block_ops *ops; };
> +
> +struct amdgpu_ras_block_ops {
> +	int (*ras_block_match)(struct amdgpu_ras_block_object* block_obj,
> enum amdgpu_ras_block block, uint32_t sub_block_index);
> +	int (*ras_late_init)(struct amdgpu_device *adev);
> +	void (*ras_fini)(struct amdgpu_device *adev);
> +	int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
> +	void  (*query_ras_error_count)(struct amdgpu_device *adev,void
> *ras_error_status);
> +	void (*query_ras_error_status)(struct amdgpu_device *adev);
> +	bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
> +	void (*query_ras_error_address)(struct amdgpu_device *adev, void
> *ras_error_status);
> +	void (*reset_ras_error_count)(struct amdgpu_device *adev);
> +	void (*reset_ras_error_status)(struct amdgpu_device *adev); };
> +
>  /* work flow
>   * vbios
>   * 1: ras feature enable (enabled by default) @@ -652,4 +680,5 @@ const char
> *get_ras_block_str(struct ras_common_if *ras_block);
> 
>  bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
> 
> +int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct
> +amdgpu_ras_block_object* ras_block_obj);
>  #endif
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h
  2021-12-06  6:56   ` Zhou1, Tao
@ 2021-12-07  2:31     ` Chai, Thomas
  0 siblings, 0 replies; 20+ messages in thread
From: Chai, Thomas @ 2021-12-07  2:31 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx; +Cc: Zhang, Hawking



-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Monday, December 6, 2021 2:57 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h

[AMD Official Use Only]



> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking 
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai, Thomas 
> <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed 
> problem when other ras blocks' .h include amdgpu_ras.h
> 
> Modify the compilation failed problem when other ras blocks' .h 
> include

>[Tao] 'Fix' is better than "Modify" here.
[Thomas] OK.

> amdgpu_ras.h.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++++++++++++++++++ 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 23 ++++-------------------
>  2 files changed, 26 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 8713575c7cf1..1cf1f6331db1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2739,6 +2739,28 @@ static void
> amdgpu_register_bad_pages_mca_notifier(void)
>          }
>  }
>  #endif
> +
> +/* check if ras is supported on block, say, sdma, gfx */ int 
> +amdgpu_ras_is_supported(struct amdgpu_device *adev,
> +		unsigned int block)
> +{
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	if (block >= AMDGPU_RAS_BLOCK_COUNT)
> +		return 0;
> +	return ras && (adev->ras_enabled & (1 << block)); }
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) {
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> +		schedule_work(&ras->recovery_work);
> +	return 0;
> +}
> +
> +
>  /* Rigister each ip ras block into amdgpu ras */  int 
> amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
>  		struct amdgpu_ras_block_object* ras_block_obj) diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index d6e5e3c862bd..41623a649fa1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -514,16 +514,6 @@ struct amdgpu_ras_block_ops {
>  #define amdgpu_ras_get_context(adev)		((adev)->psp.ras_context.ras)
>  #define amdgpu_ras_set_context(adev, ras_con)	((adev)-
> >psp.ras_context.ras = (ras_con))
> 
> -/* check if ras is supported on block, say, sdma, gfx */ -static 
> inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
> -		unsigned int block)
> -{
> -	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> -	if (block >= AMDGPU_RAS_BLOCK_COUNT)
> -		return 0;
> -	return ras && (adev->ras_enabled & (1 << block));
> -}
> 
>  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
> 
> @@ -540,15 +530,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device 
> *adev,
> 
>  int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
> 
> -static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) -{
> -	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> -	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> -		schedule_work(&ras->recovery_work);
> -	return 0;
> -}
> -
>  static inline enum ta_ras_block
>  amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
>  	switch (block) {
> @@ -680,5 +661,9 @@ const char *get_ras_block_str(struct ras_common_if 
> *ras_block);
> 
>  bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
> 
> +int amdgpu_ras_is_supported(struct amdgpu_device *adev,	unsigned int
> block);
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
> +
>  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct
> amdgpu_ras_block_object* ras_block_obj);  #endif
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops
  2021-12-06  6:58   ` Zhou1, Tao
@ 2021-12-07  3:37     ` Chai, Thomas
  2021-12-07  4:06       ` Zhou1, Tao
  0 siblings, 1 reply; 20+ messages in thread
From: Chai, Thomas @ 2021-12-07  3:37 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx; +Cc: Zhang, Hawking

Hi Tao,
     I have added my replies after your comments. Please review.

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Monday, December 6, 2021 2:58 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops

[AMD Official Use Only]

Please see my comments inline.

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking 
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai, Thomas 
> <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the 
> unified ras block data and ops
> 
> 1.Modify gfx block to fit for the unified ras block data and ops 
> 2.Implement .ras_block_match function pointer for gfx block to identify itself.
> 3.Change amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and the corresponding 
> variable name remove _funcs suffix.
> 4.Remove the const flag of gfx ras variable so that gfx ras block can 
> be able to be insertted into amdgpu device ras block link list.
> 5.Invoke amdgpu_ras_register_ras_block function to register gfx ras 
> block into amdgpu device ras block link list.
> 6.Remove the redundant code about gfx in amdgpu_ras.c after using the 
> unified ras block.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  6 +- 
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 15 ++--- 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 80 ++++++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 73 +++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 39 ++++++++----
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 42 +++++++++---- 
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
>  8 files changed, 178 insertions(+), 81 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 1795d448c700..da8691259ac1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct
> amdgpu_device *adev,
>  	 */
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>  		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_count)
> -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> err_data);
> +		if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +		    adev->gfx.ras->ras_block.ops->query_ras_error_count)
> +			adev->gfx.ras->ras_block.ops-
> >query_ras_error_count(adev, err_data);
>  		amdgpu_ras_reset_gpu(adev);
>  	}
>  	return AMDGPU_RAS_SUCCESS;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 6b78b4a0e182..ff4a8428a84b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -31,6 +31,7 @@
>  #include "amdgpu_ring.h"
>  #include "amdgpu_rlc.h"
>  #include "soc15.h"
> +#include "amdgpu_ras.h"
> 
>  /* GFX current status */
>  #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
> @@ -213,16 +214,8 @@ struct amdgpu_cu_info {
>  	uint32_t bitmap[4][4];
>  };
> 
> -struct amdgpu_gfx_ras_funcs {
> -	int (*ras_late_init)(struct amdgpu_device *adev);
> -	void (*ras_fini)(struct amdgpu_device *adev);
> -	int (*ras_error_inject)(struct amdgpu_device *adev,
> -				void *inject_if);
> -	int (*query_ras_error_count)(struct amdgpu_device *adev,
> -				     void *ras_error_status);
> -	void (*reset_ras_error_count)(struct amdgpu_device *adev);
> -	void (*query_ras_error_status)(struct amdgpu_device *adev);
> -	void (*reset_ras_error_status)(struct amdgpu_device *adev);
> +struct amdgpu_gfx_ras {
> +	struct amdgpu_ras_block_object  ras_block;
>  	void (*enable_watchdog_timer)(struct amdgpu_device *adev);  };

>[Tao] Can we add the "enable_watchdog_timer" function to the amdgpu_ras_block_ops structure?
>And I think using ras_block directly is simpler than the amdgpu_gfx_ras gfx_v9_0_ras structure.

[Thomas] The 'enable_watchdog_timer' function is not a common function. It is only defined in gfx_v9_4_2.c and called in gfx_v9_0.c.
	   I think the function pointers in the amdgpu_ras_block_ops structure should be the functions used by most blocks, and the final goal of the amdgpu_ras_block_ops structure is to eliminate explicit calls to specific blocks in the amdgpu_ras.c file.
                 So I think it is better for the enable_watchdog_timer function to be used only in gfx rather than moved to amdgpu_ras_block_ops.

> 
> @@ -348,7 +341,7 @@ struct amdgpu_gfx {
> 
>  	/*ras */
>  	struct ras_common_if			*ras_if;
> -	const struct amdgpu_gfx_ras_funcs	*ras_funcs;
> +	struct amdgpu_gfx_ras	*ras;
>  };
> 
>  #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs-
> >get_gpu_clock_counter((adev))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1cf1f6331db1..190a4a4e9d7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -862,6 +862,27 @@ static int amdgpu_ras_enable_all_features(struct
> amdgpu_device *adev,  }
>  /* feature ctl end */
> 
> +static struct amdgpu_ras_block_object* 
> +amdgpu_ras_get_ras_block(struct
> amdgpu_device *adev,
> +					enum amdgpu_ras_block block,
> uint32_t sub_block_index) {
> +	struct amdgpu_ras_block_object *obj, *tmp;
> +
> +	if (block >= AMDGPU_RAS_BLOCK__LAST) {
> +		return NULL;
> +	}
>[Tao] The "{}" can be dropped since only one line under the if.
    [Thomas] OK.

> +
> +	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> +		if( !obj->ops || !obj->ops->ras_block_match) {
[Tao]  Need a space after "if" and the space before "!obj" can be removed. 

> +			dev_info(adev->dev, "%s don't config ops or
> ras_block_match\n", obj->name);
> +			continue;
> +		}
> +		if (!obj->ops->ras_block_match(obj, block, sub_block_index)) {
> +			return obj;
> +		}
>[Tao] The "{}" can be removed.
      [Thomas] OK.

> +	}
> +
> +	return NULL;
> +}
>[Tao] This is a generic ras function, not gfx specific, the code can be moved to patch #1.
    [Thomas] OK.
> 
>  void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
>  				       struct ras_common_if *ras_block, @@ -
> 892,6 +913,7 @@ void amdgpu_ras_mca_query_error_status(struct
> amdgpu_device *adev,  int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  				  struct ras_query_if *info)
>  {
> +	struct amdgpu_ras_block_object* block_obj = NULL;
>  	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>  	struct ras_err_data err_data = {0, 0, 0, NULL};
>  	int i;
> @@ -899,6 +921,8 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  	if (!obj)
>  		return -EINVAL;
> 
> +	block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
> +
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__UMC:
>  		if (adev->umc.ras_funcs &&
> @@ -919,13 +943,17 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  		}
>  		break;
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_count)
> -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> &err_data);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> +				get_ras_block_str(&info->head));
> +			return -EINVAL;
> +		}
>[Tao] Can we put the check behind "block_obj = amdgpu_ras_get_ras_block"? The same suggestion to all similar code.
       [Thomas] OK.
> +
> +		if (block_obj->ops->query_ras_error_count)
> +			block_obj->ops->query_ras_error_count(adev,
> &err_data);
> 
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_status)
> -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> +		if (block_obj->ops->query_ras_error_status)
> +			block_obj->ops->query_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> @@ -1012,18 +1040,21 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,  int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>  		enum amdgpu_ras_block block)
>  {
> +	struct amdgpu_ras_block_object* block_obj = 
> +amdgpu_ras_get_ras_block(adev, block, 0);
>  	if (!amdgpu_ras_is_supported(adev, block))
>  		return -EINVAL;
> 
>  	switch (block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->reset_ras_error_count)
> -			adev->gfx.ras_funcs->reset_ras_error_count(adev);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> ras_block_str(block));
> +			return -EINVAL;
> +		}
> +		if (block_obj->ops->reset_ras_error_count)
> +			block_obj->ops->reset_ras_error_count(adev);
> 
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->reset_ras_error_status)
> -			adev->gfx.ras_funcs->reset_ras_error_status(adev);
> +		if (block_obj->ops->reset_ras_error_status)
> +			block_obj->ops->reset_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> @@ -1088,7 +1119,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device 
> *adev,
>  		.address = info->address,
>  		.value = info->value,
>  	};
> -	int ret = 0;
> +	int ret = -EINVAL;
> +	struct amdgpu_ras_block_object* block_obj = 
> +amdgpu_ras_get_ras_block(adev, info->head.block,
> +info->head.sub_block_index);
> 
>  	if (!obj)
>  		return -EINVAL;
> @@ -1102,11 +1134,12 @@ int amdgpu_ras_error_inject(struct 
> amdgpu_device *adev,
> 
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->ras_error_inject)
> -			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
> -		else
> -			ret = -EINVAL;
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> +			return -EINVAL;
> +		}
> +		if (block_obj->ops->ras_error_inject)
> +			ret = block_obj->ops->ras_error_inject(adev, info);
>  		break;
>  	case AMDGPU_RAS_BLOCK__UMC:
>  	case AMDGPU_RAS_BLOCK__SDMA:
> @@ -1727,15 +1760,20 @@ static void 
> amdgpu_ras_log_on_err_counter(struct
> amdgpu_device *adev)  static void amdgpu_ras_error_status_query(struct
> amdgpu_device *adev,
>  					  struct ras_query_if *info)
>  {
> +	struct amdgpu_ras_block_object* block_obj = 
> +amdgpu_ras_get_ras_block(adev, info->head.block,
> +info->head.sub_block_index);
>  	/*
>  	 * Only two block need to query read/write
>  	 * RspStatus at current state
>  	 */
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_status)
> -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> +			return ;
> +		}
> +
> +		if (block_obj->ops->query_ras_error_status)
> +			block_obj->ops->query_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 08e91e7245df..2ffde223c4f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -817,7 +817,7 @@ static int gfx_v9_0_get_cu_info(struct 
> amdgpu_device *adev,  static uint64_t 
> gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);  static 
> void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring); static u64 
> gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); -static int 
> gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device 
> +*adev,
>  					  void *ras_error_status);
>  static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
>  				     void *inject_if);
> @@ -2118,6 +2118,18 @@ static void gfx_v9_0_select_me_pipe_q(struct 
> amdgpu_device *adev,
>  	soc15_grbm_select(adev, me, pipe, q, vm);  }
> 
> +static int gfx_v9_0_ras_block_match(struct amdgpu_ras_block_object* 
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +
> +	return -EINVAL;
>[Tao] The return type can be changed to bool and return value is true or false instead of -EINVAL and 0.
       [Thomas] I think an int return type may offer more extensibility for a unified ops interface.

> +}
>[Tao] It's better to implement a general ras block match function in amdgpu_ras.c
       [Thomas] The match method of the mca block is different from other blocks. Other blocks match on the block only, but the mca block must match on both the block and the sub block index.
	          But I can add a default match function in amdgpu_ras.c; if an IP block doesn't define the .ras_block_match function, it will use the default match function in amdgpu_ras.c.
> +
>  static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
>          .get_gpu_clock_counter = &gfx_v9_0_get_gpu_clock_counter,
>          .select_se_sh = &gfx_v9_0_select_se_sh, @@ -2127,12 +2139,21 
> @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
>          .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,  };
> 
> -static const struct amdgpu_gfx_ras_funcs gfx_v9_0_ras_funcs = {
> -	.ras_late_init = amdgpu_gfx_ras_late_init,
> -	.ras_fini = amdgpu_gfx_ras_fini,
> -	.ras_error_inject = &gfx_v9_0_ras_error_inject,
> -	.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> -	.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
> +const struct amdgpu_ras_block_ops  gfx_v9_0_ras_ops = {

>[Tao]  static const?
    [Thomas] OK.
> +		.ras_block_match = gfx_v9_0_ras_block_match,
> +		.ras_late_init = amdgpu_gfx_ras_late_init,
> +		.ras_fini = amdgpu_gfx_ras_fini,
> +		.ras_error_inject = &gfx_v9_0_ras_error_inject,
> +		.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> +		.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count, };
> +
> +static struct amdgpu_gfx_ras gfx_v9_0_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_0_ras_ops,
> +	},
>  };
> 
>  static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) @@ 
> -2161,7
> +2182,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device 
> +*adev)
>  		DRM_INFO("fix gfx.config for vega12\n");
>  		break;
>  	case CHIP_VEGA20:
> -		adev->gfx.ras_funcs = &gfx_v9_0_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_0_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2187,7 +2208,7 @@ static int gfx_v9_0_gpu_early_init(struct 
> amdgpu_device
> *adev)
>  			gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
>  		break;
>  	case CHIP_ARCTURUS:
> -		adev->gfx.ras_funcs = &gfx_v9_4_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_4_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2208,7 +2229,7 @@ static int gfx_v9_0_gpu_early_init(struct 
> amdgpu_device
> *adev)
>  		gb_addr_config |= 0x22010042;
>  		break;
>  	case CHIP_ALDEBARAN:
> -		adev->gfx.ras_funcs = &gfx_v9_4_2_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_4_2_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2227,6 +2248,14 @@ static int gfx_v9_0_gpu_early_init(struct 
> amdgpu_device
> *adev)
>  		break;
>  	}
> 
> +	if (adev->gfx.ras) {
> +		err = amdgpu_ras_register_ras_block(adev, &adev->gfx.ras-
> >ras_block);
> +		if (err) {
> +			DRM_ERROR("Failed to register gfx ras block!\n");
> +			return err;
> +		}
> +	}
> +
>  	adev->gfx.config.gb_addr_config = gb_addr_config;
> 
>  	adev->gfx.config.gb_addr_config_fields.num_pipes = 1 << @@ -2448,9
> +2477,9 @@ static int gfx_v9_0_sw_fini(void *handle)
>  	int i;
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->ras_fini)
> -		adev->gfx.ras_funcs->ras_fini(adev);
> +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +	    adev->gfx.ras->ras_block.ops->ras_fini)
> +		adev->gfx.ras->ras_block.ops->ras_fini(adev);
> 
>  	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>  		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
> @@ -4888,16 +4917,16 @@ static int gfx_v9_0_ecc_late_init(void *handle)
>  	if (r)
>  		return r;
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->ras_late_init) {
> -		r = adev->gfx.ras_funcs->ras_late_init(adev);
> +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +	    adev->gfx.ras->ras_block.ops->ras_late_init) {
> +		r = adev->gfx.ras->ras_block.ops->ras_late_init(adev);
>  		if (r)
>  			return r;
>  	}
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->enable_watchdog_timer)
> -		adev->gfx.ras_funcs->enable_watchdog_timer(adev);
> +	if (adev->gfx.ras &&
> +	    adev->gfx.ras->enable_watchdog_timer)
> +		adev->gfx.ras->enable_watchdog_timer(adev);
> 
>  	return 0;
>  }
> @@ -6841,7 +6870,7 @@ static void 
> gfx_v9_0_reset_ras_error_count(struct
> amdgpu_device *adev)
>  	WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);  }
> 
> -static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device 
> +*adev,
>  					  void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data 
> *)ras_error_status; @@ -6850,7 +6879,7 @@ static int 
> gfx_v9_0_query_ras_error_count(struct
> amdgpu_device *adev,
>  	uint32_t reg_value;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -6879,8 +6908,6 @@ static int gfx_v9_0_query_ras_error_count(struct
> amdgpu_device *adev,
>  	mutex_unlock(&adev->grbm_idx_mutex);
> 
>  	gfx_v9_0_query_utc_edc_status(adev, err_data);
> -
> -	return 0;
>  }
> 
>  static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring) diff 
> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> index b4789dfc2bb9..2d816addbd4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> @@ -863,7 +863,7 @@ static int gfx_v9_4_ras_error_count(struct 
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> -static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_4_query_ras_error_count(struct amdgpu_device 
> +*adev,
>  					  void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data 
> *)ras_error_status; @@ -872,7 +872,7 @@ static int 
> gfx_v9_4_query_ras_error_count(struct
> amdgpu_device *adev,
>  	uint32_t reg_value;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -903,7 +903,6 @@ static int gfx_v9_4_query_ras_error_count(struct
> amdgpu_device *adev,
> 
>  	gfx_v9_4_query_utc_edc_status(adev, err_data);
> 
> -	return 0;
>  }
> 
>  static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device 
> *adev) @@
> -1029,11 +1028,31 @@ static void 
> gfx_v9_4_query_ras_error_status(struct
> amdgpu_device *adev)
>  	mutex_unlock(&adev->grbm_idx_mutex);
>  }
> 
> -const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs = {
> -        .ras_late_init = amdgpu_gfx_ras_late_init,
> -        .ras_fini = amdgpu_gfx_ras_fini,
> -        .ras_error_inject = &gfx_v9_4_ras_error_inject,
> -        .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> -        .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> -        .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
> +static int gfx_v9_4_ras_block_match(struct amdgpu_ras_block_object* 
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +	return -EINVAL;
> +}
> +
> +const struct amdgpu_ras_block_ops  gfx_v9_4_ras_ops = {
> +	.ras_block_match = gfx_v9_4_ras_block_match,
> +	.ras_late_init = amdgpu_gfx_ras_late_init,
> +	.ras_fini = amdgpu_gfx_ras_fini,
> +	.ras_error_inject = &gfx_v9_4_ras_error_inject,
> +	.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> +	.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> +	.query_ras_error_status = &gfx_v9_4_query_ras_error_status, };
> +
> +struct amdgpu_gfx_ras gfx_v9_4_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_4_ras_ops,
> +	},
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> index bdd16b568021..ca520a767267 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> @@ -24,6 +24,6 @@
>  #ifndef __GFX_V9_4_H__
>  #define __GFX_V9_4_H__
> 
> -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs;
> +extern struct amdgpu_gfx_ras gfx_v9_4_ras;
> 
>  #endif /* __GFX_V9_4_H__ */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> index 54306fd45ff1..2744709fa09d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> @@ -1644,14 +1644,14 @@ static int 
> gfx_v9_4_2_query_utc_edc_count(struct
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> -static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device 
> *adev,
> +static void gfx_v9_4_2_query_ras_error_count(struct amdgpu_device 
> +*adev,
>  					    void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
>  	uint32_t sec_count = 0, ded_count = 0;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -1664,7 +1664,6 @@ static int 
> gfx_v9_4_2_query_ras_error_count(struct
> amdgpu_device *adev,
>  	err_data->ce_count += sec_count;
>  	err_data->ue_count += ded_count;
> 
> -	return 0;
>  }
> 
>  static void gfx_v9_4_2_reset_utc_err_status(struct amdgpu_device 
> *adev) @@
> -1934,13 +1933,34 @@ static void 
> gfx_v9_4_2_reset_sq_timeout_status(struct
> amdgpu_device *adev)
>  	mutex_unlock(&adev->grbm_idx_mutex);
>  }
> 
> -const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs = {
> -	.ras_late_init = amdgpu_gfx_ras_late_init,
> -	.ras_fini = amdgpu_gfx_ras_fini,
> -	.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> -	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> -	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> -	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
> -	.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> +static int gfx_v9_4_2_ras_block_match(struct amdgpu_ras_block_object* 
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +struct amdgpu_ras_block_ops  gfx_v9_4_2_ras_ops ={
> +		.ras_block_match = gfx_v9_4_2_ras_block_match,
> +		.ras_late_init = amdgpu_gfx_ras_late_init,
> +		.ras_fini = amdgpu_gfx_ras_fini,
> +		.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> +		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> +		.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> +		.query_ras_error_status =
> &gfx_v9_4_2_query_ras_error_status,
> +		.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> +};
> +
> +struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_4_2_ras_ops,
> +	},
>  	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> index 6db1f88509af..7584624b641c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> @@ -31,6 +31,6 @@ void gfx_v9_4_2_init_golden_registers(struct
> amdgpu_device *adev,  void gfx_v9_4_2_set_power_brake_sequence(struct
> amdgpu_device *adev);  int gfx_v9_4_2_do_edc_gpr_workarounds(struct
> amdgpu_device *adev);
> 
> -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs;
> +extern struct amdgpu_gfx_ras gfx_v9_4_2_ras;
> 
>  #endif /* __GFX_V9_4_2_H__ */
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops
  2021-12-07  3:37     ` Chai, Thomas
@ 2021-12-07  4:06       ` Zhou1, Tao
  2021-12-07  6:31         ` Chai, Thomas
  0 siblings, 1 reply; 20+ messages in thread
From: Zhou1, Tao @ 2021-12-07  4:06 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx; +Cc: Zhang, Hawking

[AMD Official Use Only]

Hi Thomas,

Please see my two comments.

Regards,
Tao

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Tuesday, December 7, 2021 11:37 AM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the
> unified ras block data and ops
> 
> Hi tao:
>      I add my comments behind your comments. Please review.
> 
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@amd.com>
> Sent: Monday, December 6, 2021 2:58 PM
> To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the
> unified ras block data and ops
> 
> [AMD Official Use Only]
> 
> Please see my comments inline.
> 
> > -----Original Message-----
> > From: Chai, Thomas <YiPeng.Chai@amd.com>
> > Sent: Wednesday, December 1, 2021 6:53 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> > <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai,
> Thomas
> > <YiPeng.Chai@amd.com>
> > Subject: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the
> > unified ras block data and ops
> >
> > 1.Modify gfx block to fit for the unified ras block data and ops
> > 2.Implement .ras_block_match function pointer for gfx block to identify itself.
> > 3.Change amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and the corresponding
> > variable name remove _funcs suffix.
> > 4.Remove the const flag of gfx ras variable so that gfx ras block can
> > be able to be insertted into amdgpu device ras block link list.
> > 5.Invoke amdgpu_ras_register_ras_block function to register gfx ras
> > block into amdgpu device ras block link list.
> > 6.Remove the redundant code about gfx in amdgpu_ras.c after using the
> > unified ras block.
> >
> > Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  6 +-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 15 ++---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 80 ++++++++++++++++++------
> -
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 73 +++++++++++++++-------
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 39 ++++++++----
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 42 +++++++++----
> > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
> >  8 files changed, 178 insertions(+), 81 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index 1795d448c700..da8691259ac1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct
> > amdgpu_device *adev,
> >  	 */
> >  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> >  		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->query_ras_error_count)
> > -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> > err_data);
> > +		if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> > +		    adev->gfx.ras->ras_block.ops->query_ras_error_count)
> > +			adev->gfx.ras->ras_block.ops-
> > >query_ras_error_count(adev, err_data);
> >  		amdgpu_ras_reset_gpu(adev);
> >  	}
> >  	return AMDGPU_RAS_SUCCESS;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index 6b78b4a0e182..ff4a8428a84b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -31,6 +31,7 @@
> >  #include "amdgpu_ring.h"
> >  #include "amdgpu_rlc.h"
> >  #include "soc15.h"
> > +#include "amdgpu_ras.h"
> >
> >  /* GFX current status */
> >  #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
> > @@ -213,16 +214,8 @@ struct amdgpu_cu_info {
> >  	uint32_t bitmap[4][4];
> >  };
> >
> > -struct amdgpu_gfx_ras_funcs {
> > -	int (*ras_late_init)(struct amdgpu_device *adev);
> > -	void (*ras_fini)(struct amdgpu_device *adev);
> > -	int (*ras_error_inject)(struct amdgpu_device *adev,
> > -				void *inject_if);
> > -	int (*query_ras_error_count)(struct amdgpu_device *adev,
> > -				     void *ras_error_status);
> > -	void (*reset_ras_error_count)(struct amdgpu_device *adev);
> > -	void (*query_ras_error_status)(struct amdgpu_device *adev);
> > -	void (*reset_ras_error_status)(struct amdgpu_device *adev);
> > +struct amdgpu_gfx_ras {
> > +	struct amdgpu_ras_block_object  ras_block;
> >  	void (*enable_watchdog_timer)(struct amdgpu_device *adev);  };
> 
> >[Tao] Can we add " enable_watchdog_timer" function into
> amdgpu_ras_block_ops structure?
> >And I think using ras_block directly is more simple than amdgpu_gfx_ras
> gfx_v9_0_ras structure.
> 
> [Thomas] The ' enable_watchdog_timer ' function is not a common function. It
> is only defined by gfx_v9_4_2.c and called in gfx_v9_0.c.
> 	   I think the function pointers in the amdgpu_ras_block_ops structure
> should be the functions used by most blocks and the final goal of
> amdgpu_ras_block_ops structure is to eliminate explicit calls to special blocks in
> amdgpu_ras.c file.
>                  So, I think it had better that the enable_watchdog_timer function only
> use in gfx but not move to amdgpu_ras_block_ops.

[Tao] I understand your concern; it's a tradeoff. Take the following code for example: I think struct amdgpu_hdp_ras can be dropped and we can use ras_block directly.

struct amdgpu_hdp_ras hdp_v4_0_ras = {
	.ras_block = {
		.name = "hdp",
		.block = AMDGPU_RAS_BLOCK__HDP,
		.ops = &hdp_v4_0_ras_ops,
	},
};

The struct amdgpu_gfx_ras below can also be discarded if enable_watchdog_timer is moved to amdgpu_ras_block_ops. The current implementation is a little bit complicated.

struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
	.ras_block = {
		.name = "gfx",
		.block = AMDGPU_RAS_BLOCK__GFX,
		.ops = &gfx_v9_4_2_ras_ops,
	},
	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
};

> 
> >
> > @@ -348,7 +341,7 @@ struct amdgpu_gfx {
> >
> >  	/*ras */
> >  	struct ras_common_if			*ras_if;
> > -	const struct amdgpu_gfx_ras_funcs	*ras_funcs;
> > +	struct amdgpu_gfx_ras	*ras;
> >  };
> >
> >  #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs-
> > >get_gpu_clock_counter((adev))
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 1cf1f6331db1..190a4a4e9d7a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -862,6 +862,27 @@ static int amdgpu_ras_enable_all_features(struct
> > amdgpu_device *adev,  }
> >  /* feature ctl end */
> >
> > +static struct amdgpu_ras_block_object*
> > +amdgpu_ras_get_ras_block(struct
> > amdgpu_device *adev,
> > +					enum amdgpu_ras_block block,
> > uint32_t sub_block_index) {
> > +	struct amdgpu_ras_block_object *obj, *tmp;
> > +
> > +	if (block >= AMDGPU_RAS_BLOCK__LAST) {
> > +		return NULL;
> > +	}
> >[Tao] The "{}" can be dropped since only one line under the if.
>     [Thomas] OK.
> 
> > +
> > +	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> > +		if( !obj->ops || !obj->ops->ras_block_match) {
> [Tao]  Need a space after "if" and the space before "!obj" can be removed.
> 
> > +			dev_info(adev->dev, "%s don't config ops or
> > ras_block_match\n", obj->name);
> > +			continue;
> > +		}
> > +		if (!obj->ops->ras_block_match(obj, block, sub_block_index)) {
> > +			return obj;
> > +		}
> >[Tao] The "{}" can be removed.
>       [Thomas] OK.
> 
> > +	}
> > +
> > +	return NULL;
> > +}
> >[Tao] This is a generic ras function, not gfx specific, the code can be moved to
> patch #1.
>     [Thomas] OK.
> >
> >  void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
> >  				       struct ras_common_if *ras_block, @@ -
> > 892,6 +913,7 @@ void amdgpu_ras_mca_query_error_status(struct
> > amdgpu_device *adev,  int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,
> >  				  struct ras_query_if *info)
> >  {
> > +	struct amdgpu_ras_block_object* block_obj = NULL;
> >  	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
> >  	struct ras_err_data err_data = {0, 0, 0, NULL};
> >  	int i;
> > @@ -899,6 +921,8 @@ int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,
> >  	if (!obj)
> >  		return -EINVAL;
> >
> > +	block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
> > +
> >  	switch (info->head.block) {
> >  	case AMDGPU_RAS_BLOCK__UMC:
> >  		if (adev->umc.ras_funcs &&
> > @@ -919,13 +943,17 @@ int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,
> >  		}
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__GFX:
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->query_ras_error_count)
> > -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> > &err_data);
> > +		if (!block_obj || !block_obj->ops)	{
> > +			dev_info(adev->dev, "%s don't config ras function \n",
> > +				get_ras_block_str(&info->head));
> > +			return -EINVAL;
> > +		}
> >[Tao] Can we put the check behind "block_obj = amdgpu_ras_get_ras_block"?
> The same suggestion to all similar code.
>        [Thomas] OK.
> > +
> > +		if (block_obj->ops->query_ras_error_count)
> > +			block_obj->ops->query_ras_error_count(adev,
> > &err_data);
> >
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->query_ras_error_status)
> > -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> > +		if (block_obj->ops->query_ras_error_status)
> > +			block_obj->ops->query_ras_error_status(adev);
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__MMHUB:
> >  		if (adev->mmhub.ras_funcs &&
> > @@ -1012,18 +1040,21 @@ int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,  int amdgpu_ras_reset_error_status(struct
> > amdgpu_device *adev,
> >  		enum amdgpu_ras_block block)
> >  {
> > +	struct amdgpu_ras_block_object* block_obj =
> > +amdgpu_ras_get_ras_block(adev, block, 0);
> >  	if (!amdgpu_ras_is_supported(adev, block))
> >  		return -EINVAL;
> >
> >  	switch (block) {
> >  	case AMDGPU_RAS_BLOCK__GFX:
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->reset_ras_error_count)
> > -			adev->gfx.ras_funcs->reset_ras_error_count(adev);
> > +		if (!block_obj || !block_obj->ops)	{
> > +			dev_info(adev->dev, "%s don't config ras function \n",
> > ras_block_str(block));
> > +			return -EINVAL;
> > +		}
> > +		if (block_obj->ops->reset_ras_error_count)
> > +			block_obj->ops->reset_ras_error_count(adev);
> >
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->reset_ras_error_status)
> > -			adev->gfx.ras_funcs->reset_ras_error_status(adev);
> > +		if (block_obj->ops->reset_ras_error_status)
> > +			block_obj->ops->reset_ras_error_status(adev);
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__MMHUB:
> >  		if (adev->mmhub.ras_funcs &&
> > @@ -1088,7 +1119,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device
> > *adev,
> >  		.address = info->address,
> >  		.value = info->value,
> >  	};
> > -	int ret = 0;
> > +	int ret = -EINVAL;
> > +	struct amdgpu_ras_block_object* block_obj =
> > +amdgpu_ras_get_ras_block(adev, info->head.block,
> > +info->head.sub_block_index);
> >
> >  	if (!obj)
> >  		return -EINVAL;
> > @@ -1102,11 +1134,12 @@ int amdgpu_ras_error_inject(struct
> > amdgpu_device *adev,
> >
> >  	switch (info->head.block) {
> >  	case AMDGPU_RAS_BLOCK__GFX:
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->ras_error_inject)
> > -			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
> > -		else
> > -			ret = -EINVAL;
> > +		if (!block_obj || !block_obj->ops)	{
> > +			dev_info(adev->dev, "%s don't config ras function \n",
> > get_ras_block_str(&info->head));
> > +			return -EINVAL;
> > +		}
> > +		if (block_obj->ops->ras_error_inject)
> > +			ret = block_obj->ops->ras_error_inject(adev, info);
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__UMC:
> >  	case AMDGPU_RAS_BLOCK__SDMA:
> > @@ -1727,15 +1760,20 @@ static void
> > amdgpu_ras_log_on_err_counter(struct
> > amdgpu_device *adev)  static void amdgpu_ras_error_status_query(struct
> > amdgpu_device *adev,
> >  					  struct ras_query_if *info)
> >  {
> > +	struct amdgpu_ras_block_object* block_obj =
> > +amdgpu_ras_get_ras_block(adev, info->head.block,
> > +info->head.sub_block_index);
> >  	/*
> >  	 * Only two block need to query read/write
> >  	 * RspStatus at current state
> >  	 */
> >  	switch (info->head.block) {
> >  	case AMDGPU_RAS_BLOCK__GFX:
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->query_ras_error_status)
> > -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> > +		if (!block_obj || !block_obj->ops)	{
> > +			dev_info(adev->dev, "%s don't config ras function \n",
> > get_ras_block_str(&info->head));
> > +			return ;
> > +		}
> > +
> > +		if (block_obj->ops->query_ras_error_status)
> > +			block_obj->ops->query_ras_error_status(adev);
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__MMHUB:
> >  		if (adev->mmhub.ras_funcs &&
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 08e91e7245df..2ffde223c4f5 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -817,7 +817,7 @@ static int gfx_v9_0_get_cu_info(struct
> > amdgpu_device *adev,  static uint64_t
> > gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);  static
> > void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring); static u64
> > gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); -static int
> > gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
> > +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device
> > +*adev,
> >  					  void *ras_error_status);
> >  static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
> >  				     void *inject_if);
> > @@ -2118,6 +2118,18 @@ static void gfx_v9_0_select_me_pipe_q(struct
> > amdgpu_device *adev,
> >  	soc15_grbm_select(adev, me, pipe, q, vm);  }
> >
> > +static int gfx_v9_0_ras_block_match(struct amdgpu_ras_block_object*
> > +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> > +	if(!block_obj)
> > +		return -EINVAL;
> > +
> > +	if(block_obj->block == block) {
> > +		return 0;
> > +	}
> > +
> > +	return -EINVAL;
> >[Tao] The return type can be changed to bool and return value is true or false
> instead of -EINVAL and 0.
>        [Thomas] I think the return type is int maybe have more scalability for a
> unified ops interface.

[Tao] You can use int for the convenience of scalability in the future. However, -EINVAL means an error, but here it only refers to no matched block. Would 0 == "no block" and 1 == "block found" be better?

> 
> > +}
> >[Tao] It's better to implement a general ras block match function in
> >amdgpu_ras.c
>        [Thomas] The match method of the mca block is different from other blocks.
> Other blocks only use block to match, but the mca block should use block and sub
> block index to match.
> 	          But I can add a default match function in amdgpu_ras.c; if an ip
> block doesn't define a .ras_block_match function, it will use the default match
> function in amdgpu_ras.c.
> > +
> >  static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
> >          .get_gpu_clock_counter = &gfx_v9_0_get_gpu_clock_counter,
> >          .select_se_sh = &gfx_v9_0_select_se_sh, @@ -2127,12 +2139,21
> > @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
> >          .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,  };
> >
> > -static const struct amdgpu_gfx_ras_funcs gfx_v9_0_ras_funcs = {
> > -	.ras_late_init = amdgpu_gfx_ras_late_init,
> > -	.ras_fini = amdgpu_gfx_ras_fini,
> > -	.ras_error_inject = &gfx_v9_0_ras_error_inject,
> > -	.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> > -	.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
> > +const struct amdgpu_ras_block_ops  gfx_v9_0_ras_ops = {
> 
> >[Tao]  static const?
>     [Thomas] OK.
> > +		.ras_block_match = gfx_v9_0_ras_block_match,
> > +		.ras_late_init = amdgpu_gfx_ras_late_init,
> > +		.ras_fini = amdgpu_gfx_ras_fini,
> > +		.ras_error_inject = &gfx_v9_0_ras_error_inject,
> > +		.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> > +		.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count, };
> > +
> > +static struct amdgpu_gfx_ras gfx_v9_0_ras = {
> > +	.ras_block = {
> > +		.name = "gfx",
> > +		.block = AMDGPU_RAS_BLOCK__GFX,
> > +		.ops = &gfx_v9_0_ras_ops,
> > +	},
> >  };
> >
> >  static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) @@
> > -2161,7
> > +2182,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device
> > +*adev)
> >  		DRM_INFO("fix gfx.config for vega12\n");
> >  		break;
> >  	case CHIP_VEGA20:
> > -		adev->gfx.ras_funcs = &gfx_v9_0_ras_funcs;
> > +		adev->gfx.ras = &gfx_v9_0_ras;
> >  		adev->gfx.config.max_hw_contexts = 8;
> >  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
> >  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> > 2187,7 +2208,7 @@ static int gfx_v9_0_gpu_early_init(struct
> > amdgpu_device
> > *adev)
> >  			gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
> >  		break;
> >  	case CHIP_ARCTURUS:
> > -		adev->gfx.ras_funcs = &gfx_v9_4_ras_funcs;
> > +		adev->gfx.ras = &gfx_v9_4_ras;
> >  		adev->gfx.config.max_hw_contexts = 8;
> >  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
> >  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> > 2208,7 +2229,7 @@ static int gfx_v9_0_gpu_early_init(struct
> > amdgpu_device
> > *adev)
> >  		gb_addr_config |= 0x22010042;
> >  		break;
> >  	case CHIP_ALDEBARAN:
> > -		adev->gfx.ras_funcs = &gfx_v9_4_2_ras_funcs;
> > +		adev->gfx.ras = &gfx_v9_4_2_ras;
> >  		adev->gfx.config.max_hw_contexts = 8;
> >  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
> >  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> > 2227,6 +2248,14 @@ static int gfx_v9_0_gpu_early_init(struct
> > amdgpu_device
> > *adev)
> >  		break;
> >  	}
> >
> > +	if (adev->gfx.ras) {
> > +		err = amdgpu_ras_register_ras_block(adev, &adev->gfx.ras-
> > >ras_block);
> > +		if (err) {
> > +			DRM_ERROR("Failed to register gfx ras block!\n");
> > +			return err;
> > +		}
> > +	}
> > +
> >  	adev->gfx.config.gb_addr_config = gb_addr_config;
> >
> >  	adev->gfx.config.gb_addr_config_fields.num_pipes = 1 << @@ -2448,9
> > +2477,9 @@ static int gfx_v9_0_sw_fini(void *handle)
> >  	int i;
> >  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> >
> > -	if (adev->gfx.ras_funcs &&
> > -	    adev->gfx.ras_funcs->ras_fini)
> > -		adev->gfx.ras_funcs->ras_fini(adev);
> > +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> > +	    adev->gfx.ras->ras_block.ops->ras_fini)
> > +		adev->gfx.ras->ras_block.ops->ras_fini(adev);
> >
> >  	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
> >  		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
> > @@ -4888,16 +4917,16 @@ static int gfx_v9_0_ecc_late_init(void *handle)
> >  	if (r)
> >  		return r;
> >
> > -	if (adev->gfx.ras_funcs &&
> > -	    adev->gfx.ras_funcs->ras_late_init) {
> > -		r = adev->gfx.ras_funcs->ras_late_init(adev);
> > +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> > +	    adev->gfx.ras->ras_block.ops->ras_late_init) {
> > +		r = adev->gfx.ras->ras_block.ops->ras_late_init(adev);
> >  		if (r)
> >  			return r;
> >  	}
> >
> > -	if (adev->gfx.ras_funcs &&
> > -	    adev->gfx.ras_funcs->enable_watchdog_timer)
> > -		adev->gfx.ras_funcs->enable_watchdog_timer(adev);
> > +	if (adev->gfx.ras &&
> > +	    adev->gfx.ras->enable_watchdog_timer)
> > +		adev->gfx.ras->enable_watchdog_timer(adev);
> >
> >  	return 0;
> >  }
> > @@ -6841,7 +6870,7 @@ static void
> > gfx_v9_0_reset_ras_error_count(struct
> > amdgpu_device *adev)
> >  	WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);  }
> >
> > -static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
> > +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device
> > +*adev,
> >  					  void *ras_error_status)
> >  {
> >  	struct ras_err_data *err_data = (struct ras_err_data
> > *)ras_error_status; @@ -6850,7 +6879,7 @@ static int
> > gfx_v9_0_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  	uint32_t reg_value;
> >
> >  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> > -		return -EINVAL;
> > +		return;
> >
> >  	err_data->ue_count = 0;
> >  	err_data->ce_count = 0;
> > @@ -6879,8 +6908,6 @@ static int gfx_v9_0_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  	mutex_unlock(&adev->grbm_idx_mutex);
> >
> >  	gfx_v9_0_query_utc_edc_status(adev, err_data);
> > -
> > -	return 0;
> >  }
> >
> >  static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring) diff
> > --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> > index b4789dfc2bb9..2d816addbd4d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> > @@ -863,7 +863,7 @@ static int gfx_v9_4_ras_error_count(struct
> > amdgpu_device *adev,
> >  	return 0;
> >  }
> >
> > -static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
> > +static void gfx_v9_4_query_ras_error_count(struct amdgpu_device
> > +*adev,
> >  					  void *ras_error_status)
> >  {
> >  	struct ras_err_data *err_data = (struct ras_err_data
> > *)ras_error_status; @@ -872,7 +872,7 @@ static int
> > gfx_v9_4_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  	uint32_t reg_value;
> >
> >  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> > -		return -EINVAL;
> > +		return;
> >
> >  	err_data->ue_count = 0;
> >  	err_data->ce_count = 0;
> > @@ -903,7 +903,6 @@ static int gfx_v9_4_query_ras_error_count(struct
> > amdgpu_device *adev,
> >
> >  	gfx_v9_4_query_utc_edc_status(adev, err_data);
> >
> > -	return 0;
> >  }
> >
> >  static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device
> > *adev) @@
> > -1029,11 +1028,31 @@ static void
> > gfx_v9_4_query_ras_error_status(struct
> > amdgpu_device *adev)
> >  	mutex_unlock(&adev->grbm_idx_mutex);
> >  }
> >
> > -const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs = {
> > -        .ras_late_init = amdgpu_gfx_ras_late_init,
> > -        .ras_fini = amdgpu_gfx_ras_fini,
> > -        .ras_error_inject = &gfx_v9_4_ras_error_inject,
> > -        .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> > -        .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> > -        .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
> > +static int gfx_v9_4_ras_block_match(struct amdgpu_ras_block_object*
> > +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> > +	if(!block_obj)
> > +		return -EINVAL;
> > +
> > +	if(block_obj->block == block) {
> > +		return 0;
> > +	}
> > +	return -EINVAL;
> > +}
> > +
> > +const struct amdgpu_ras_block_ops  gfx_v9_4_ras_ops = {
> > +	.ras_block_match = gfx_v9_4_ras_block_match,
> > +	.ras_late_init = amdgpu_gfx_ras_late_init,
> > +	.ras_fini = amdgpu_gfx_ras_fini,
> > +	.ras_error_inject = &gfx_v9_4_ras_error_inject,
> > +	.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> > +	.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> > +	.query_ras_error_status = &gfx_v9_4_query_ras_error_status, };
> > +
> > +struct amdgpu_gfx_ras gfx_v9_4_ras = {
> > +	.ras_block = {
> > +		.name = "gfx",
> > +		.block = AMDGPU_RAS_BLOCK__GFX,
> > +		.ops = &gfx_v9_4_ras_ops,
> > +	},
> >  };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> > index bdd16b568021..ca520a767267 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> > @@ -24,6 +24,6 @@
> >  #ifndef __GFX_V9_4_H__
> >  #define __GFX_V9_4_H__
> >
> > -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs;
> > +extern struct amdgpu_gfx_ras gfx_v9_4_ras;
> >
> >  #endif /* __GFX_V9_4_H__ */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> > index 54306fd45ff1..2744709fa09d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> > @@ -1644,14 +1644,14 @@ static int
> > gfx_v9_4_2_query_utc_edc_count(struct
> > amdgpu_device *adev,
> >  	return 0;
> >  }
> >
> > -static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device
> > *adev,
> > +static void gfx_v9_4_2_query_ras_error_count(struct amdgpu_device
> > +*adev,
> >  					    void *ras_error_status)
> >  {
> >  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> >  	uint32_t sec_count = 0, ded_count = 0;
> >
> >  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> > -		return -EINVAL;
> > +		return;
> >
> >  	err_data->ue_count = 0;
> >  	err_data->ce_count = 0;
> > @@ -1664,7 +1664,6 @@ static int
> > gfx_v9_4_2_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  	err_data->ce_count += sec_count;
> >  	err_data->ue_count += ded_count;
> >
> > -	return 0;
> >  }
> >
> >  static void gfx_v9_4_2_reset_utc_err_status(struct amdgpu_device
> > *adev) @@
> > -1934,13 +1933,34 @@ static void
> > gfx_v9_4_2_reset_sq_timeout_status(struct
> > amdgpu_device *adev)
> >  	mutex_unlock(&adev->grbm_idx_mutex);
> >  }
> >
> > -const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs = {
> > -	.ras_late_init = amdgpu_gfx_ras_late_init,
> > -	.ras_fini = amdgpu_gfx_ras_fini,
> > -	.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> > -	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> > -	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> > -	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
> > -	.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> > +static int gfx_v9_4_2_ras_block_match(struct amdgpu_ras_block_object*
> > +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> > +	if(!block_obj)
> > +		return -EINVAL;
> > +
> > +	if(block_obj->block == block) {
> > +		return 0;
> > +	}
> > +
> > +	return -EINVAL;
> > +}
> > +
> > +struct amdgpu_ras_block_ops  gfx_v9_4_2_ras_ops ={
> > +		.ras_block_match = gfx_v9_4_2_ras_block_match,
> > +		.ras_late_init = amdgpu_gfx_ras_late_init,
> > +		.ras_fini = amdgpu_gfx_ras_fini,
> > +		.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> > +		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> > +		.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> > +		.query_ras_error_status =
> > &gfx_v9_4_2_query_ras_error_status,
> > +		.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> > +};
> > +
> > +struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
> > +	.ras_block = {
> > +		.name = "gfx",
> > +		.block = AMDGPU_RAS_BLOCK__GFX,
> > +		.ops = &gfx_v9_4_2_ras_ops,
> > +	},
> >  	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
> >  };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> > index 6db1f88509af..7584624b641c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> > @@ -31,6 +31,6 @@ void gfx_v9_4_2_init_golden_registers(struct
> > amdgpu_device *adev,  void gfx_v9_4_2_set_power_brake_sequence(struct
> > amdgpu_device *adev);  int gfx_v9_4_2_do_edc_gpr_workarounds(struct
> > amdgpu_device *adev);
> >
> > -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs;
> > +extern struct amdgpu_gfx_ras gfx_v9_4_2_ras;
> >
> >  #endif /* __GFX_V9_4_2_H__ */
> > --
> > 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops
  2021-12-07  4:06       ` Zhou1, Tao
@ 2021-12-07  6:31         ` Chai, Thomas
  0 siblings, 0 replies; 20+ messages in thread
From: Chai, Thomas @ 2021-12-07  6:31 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx; +Cc: Zhang, Hawking

Hi tao:
     Thanks for your review. I added two more comments after your comments; please review again.

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Tuesday, December 7, 2021 12:07 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops

[AMD Official Use Only]

Hi Thomas,

Please see my two comments.

Regards,
Tao

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Tuesday, December 7, 2021 11:37 AM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for 
> the unified ras block data and ops
> 
> Hi tao:
>      I add my comments behind your comments. Please review.
> 
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@amd.com>
> Sent: Monday, December 6, 2021 2:58 PM
> To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for 
> the unified ras block data and ops
> 
> [AMD Official Use Only]
> 
> Please see my comments inline.
> 
> > -----Original Message-----
> > From: Chai, Thomas <YiPeng.Chai@amd.com>
> > Sent: Wednesday, December 1, 2021 6:53 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking 
> > <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai,
> Thomas
> > <YiPeng.Chai@amd.com>
> > Subject: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for 
> > the unified ras block data and ops
> >
> > 1.Modify gfx block to fit for the unified ras block data and ops 
> > 2.Implement .ras_block_match function pointer for gfx block to identify itself.
> > 3.Change amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and the 
> > corresponding variable name remove _funcs suffix.
> > > 4.Remove the const flag of gfx ras variable so that gfx ras block 
> > > can be inserted into amdgpu device ras block link list.
> > 5.Invoke amdgpu_ras_register_ras_block function to register gfx ras 
> > block into amdgpu device ras block link list.
> > 6.Remove the redundant code about gfx in amdgpu_ras.c after using 
> > the unified ras block.
> >
> > Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  6 +- 
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 15 ++--- 
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 80 
> > ++++++++++++++++++------
> -
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 73 +++++++++++++++-------
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 39 ++++++++----
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 42 +++++++++---- 
> > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
> >  8 files changed, 178 insertions(+), 81 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index 1795d448c700..da8691259ac1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct
> > amdgpu_device *adev,
> >  	 */
> >  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> >  		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->query_ras_error_count)
> > -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> > err_data);
> > +		if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> > +		    adev->gfx.ras->ras_block.ops->query_ras_error_count)
> > +			adev->gfx.ras->ras_block.ops-
> > >query_ras_error_count(adev, err_data);
> >  		amdgpu_ras_reset_gpu(adev);
> >  	}
> >  	return AMDGPU_RAS_SUCCESS;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index 6b78b4a0e182..ff4a8428a84b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -31,6 +31,7 @@
> >  #include "amdgpu_ring.h"
> >  #include "amdgpu_rlc.h"
> >  #include "soc15.h"
> > +#include "amdgpu_ras.h"
> >
> >  /* GFX current status */
> >  #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
> > @@ -213,16 +214,8 @@ struct amdgpu_cu_info {
> >  	uint32_t bitmap[4][4];
> >  };
> >
> > -struct amdgpu_gfx_ras_funcs {
> > -	int (*ras_late_init)(struct amdgpu_device *adev);
> > -	void (*ras_fini)(struct amdgpu_device *adev);
> > -	int (*ras_error_inject)(struct amdgpu_device *adev,
> > -				void *inject_if);
> > -	int (*query_ras_error_count)(struct amdgpu_device *adev,
> > -				     void *ras_error_status);
> > -	void (*reset_ras_error_count)(struct amdgpu_device *adev);
> > -	void (*query_ras_error_status)(struct amdgpu_device *adev);
> > -	void (*reset_ras_error_status)(struct amdgpu_device *adev);
> > +struct amdgpu_gfx_ras {
> > +	struct amdgpu_ras_block_object  ras_block;
> >  	void (*enable_watchdog_timer)(struct amdgpu_device *adev);  };
> 
> >[Tao] Can we add " enable_watchdog_timer" function into
> amdgpu_ras_block_ops structure?
> >And I think using ras_block directly is more simple than 
> >amdgpu_gfx_ras
> gfx_v9_0_ras structure.
> 
> [Thomas] The ' enable_watchdog_timer ' function is not a common 
> function. It is only defined by gfx_v9_4_2.c and called in gfx_v9_0.c.
> 	   I think the function pointers in the amdgpu_ras_block_ops 
> structure should be the functions used by most blocks and the final 
> goal of amdgpu_ras_block_ops structure is to eliminate explicit calls 
> to special blocks in amdgpu_ras.c file.
>                  So, I think it had better that the 
> enable_watchdog_timer function only use in gfx but not move to amdgpu_ras_block_ops.

>[Tao] I know your concern, it's a tradeoff. Take the following code for example, I think struct amdgpu_hdp_ras can be dropped and we can use ras_block directly.

>struct amdgpu_hdp_ras hdp_v4_0_ras = {
>	.ras_block = {
>		.name = "hdp",
>		.block = AMDGPU_RAS_BLOCK__HDP,
>		.ops = &hdp_v4_0_ras_ops,
>	},
>};

>The struct amdgpu_gfx_ras below can be also discarded if enable_watchdog_timer is moved to amdgpu_ras_block_ops. The current implementation is a little bit complicated.

>struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
>	.ras_block = {
>		.name = "gfx",
>		.block = AMDGPU_RAS_BLOCK__GFX,
>		.ops = &gfx_v9_4_2_ras_ops,
>	},
>	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
>};

      [Thomas] I understand what you mean. But if there is still a possibility of adding new ras functions for these blocks in the future, I think using the ip block's ras structure to wrap the amdgpu_ras_block_object structure may be a better practice. The wrapper can reduce code modification when adding a new ras function in the future.
	

> 
> >
> > @@ -348,7 +341,7 @@ struct amdgpu_gfx {
> >
> >  	/*ras */
> >  	struct ras_common_if			*ras_if;
> > -	const struct amdgpu_gfx_ras_funcs	*ras_funcs;
> > +	struct amdgpu_gfx_ras	*ras;
> >  };
> >
> >  #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs-
> > >get_gpu_clock_counter((adev))
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 1cf1f6331db1..190a4a4e9d7a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -862,6 +862,27 @@ static int 
> > amdgpu_ras_enable_all_features(struct
> > amdgpu_device *adev,  }
> >  /* feature ctl end */
> >
> > +static struct amdgpu_ras_block_object* 
> > +amdgpu_ras_get_ras_block(struct
> > amdgpu_device *adev,
> > +					enum amdgpu_ras_block block,
> > uint32_t sub_block_index) {
> > +	struct amdgpu_ras_block_object *obj, *tmp;
> > +
> > +	if (block >= AMDGPU_RAS_BLOCK__LAST) {
> > +		return NULL;
> > +	}
> >[Tao] The "{}" can be dropped since only one line under the if.
>     [Thomas] OK.
> 
> > +
> > +	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> > +		if( !obj->ops || !obj->ops->ras_block_match) {
> [Tao]  Need a space after "if" and the space before "!obj" can be removed.
> 
> > +			dev_info(adev->dev, "%s don't config ops or
> > ras_block_match\n", obj->name);
> > +			continue;
> > +		}
> > +		if (!obj->ops->ras_block_match(obj, block, sub_block_index)) {
> > +			return obj;
> > +		}
> >[Tao] The "{}" can be removed.
>       [Thomas] OK.
> 
> > +	}
> > +
> > +	return NULL;
> > +}
> >[Tao] This is a generic ras function, not gfx specific, the code can 
> >be moved to
> patch #1.
>     [Thomas] OK.
> >
> >  void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
> >  				       struct ras_common_if *ras_block, @@ -
> > 892,6 +913,7 @@ void amdgpu_ras_mca_query_error_status(struct
> > amdgpu_device *adev,  int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,
> >  				  struct ras_query_if *info)
> >  {
> > +	struct amdgpu_ras_block_object* block_obj = NULL;
> >  	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
> >  	struct ras_err_data err_data = {0, 0, 0, NULL};
> >  	int i;
> > @@ -899,6 +921,8 @@ int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,
> >  	if (!obj)
> >  		return -EINVAL;
> >
> > +	block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
> > +
> >  	switch (info->head.block) {
> >  	case AMDGPU_RAS_BLOCK__UMC:
> >  		if (adev->umc.ras_funcs &&
> > @@ -919,13 +943,17 @@ int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,
> >  		}
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__GFX:
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->query_ras_error_count)
> > -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> > &err_data);
> > +		if (!block_obj || !block_obj->ops)	{
> > +			dev_info(adev->dev, "%s don't config ras function \n",
> > +				get_ras_block_str(&info->head));
> > +			return -EINVAL;
> > +		}
> >[Tao] Can we put the check behind "block_obj = amdgpu_ras_get_ras_block"?
> The same suggestion to all similar code.
>        [Thomas] OK.
> > +
> > +		if (block_obj->ops->query_ras_error_count)
> > +			block_obj->ops->query_ras_error_count(adev,
> > &err_data);
> >
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->query_ras_error_status)
> > -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> > +		if (block_obj->ops->query_ras_error_status)
> > +			block_obj->ops->query_ras_error_status(adev);
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__MMHUB:
> >  		if (adev->mmhub.ras_funcs &&
> > @@ -1012,18 +1040,21 @@ int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,  int amdgpu_ras_reset_error_status(struct
> > amdgpu_device *adev,
> >  		enum amdgpu_ras_block block)
> >  {
> > +	struct amdgpu_ras_block_object* block_obj = 
> > +amdgpu_ras_get_ras_block(adev, block, 0);
> >  	if (!amdgpu_ras_is_supported(adev, block))
> >  		return -EINVAL;
> >
> >  	switch (block) {
> >  	case AMDGPU_RAS_BLOCK__GFX:
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->reset_ras_error_count)
> > -			adev->gfx.ras_funcs->reset_ras_error_count(adev);
> > +		if (!block_obj || !block_obj->ops)	{
> > +			dev_info(adev->dev, "%s don't config ras function \n",
> > ras_block_str(block));
> > +			return -EINVAL;
> > +		}
> > +		if (block_obj->ops->reset_ras_error_count)
> > +			block_obj->ops->reset_ras_error_count(adev);
> >
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->reset_ras_error_status)
> > -			adev->gfx.ras_funcs->reset_ras_error_status(adev);
> > +		if (block_obj->ops->reset_ras_error_status)
> > +			block_obj->ops->reset_ras_error_status(adev);
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__MMHUB:
> >  		if (adev->mmhub.ras_funcs &&
> > @@ -1088,7 +1119,8 @@ int amdgpu_ras_error_inject(struct 
> > amdgpu_device *adev,
> >  		.address = info->address,
> >  		.value = info->value,
> >  	};
> > -	int ret = 0;
> > +	int ret = -EINVAL;
> > +	struct amdgpu_ras_block_object* block_obj = 
> > +amdgpu_ras_get_ras_block(adev, info->head.block,
> > +info->head.sub_block_index);
> >
> >  	if (!obj)
> >  		return -EINVAL;
> > @@ -1102,11 +1134,12 @@ int amdgpu_ras_error_inject(struct 
> > amdgpu_device *adev,
> >
> >  	switch (info->head.block) {
> >  	case AMDGPU_RAS_BLOCK__GFX:
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->ras_error_inject)
> > -			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
> > -		else
> > -			ret = -EINVAL;
> > +		if (!block_obj || !block_obj->ops)	{
> > +			dev_info(adev->dev, "%s don't config ras function \n",
> > get_ras_block_str(&info->head));
> > +			return -EINVAL;
> > +		}
> > +		if (block_obj->ops->ras_error_inject)
> > +			ret = block_obj->ops->ras_error_inject(adev, info);
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__UMC:
> >  	case AMDGPU_RAS_BLOCK__SDMA:
> > @@ -1727,15 +1760,20 @@ static void
> > amdgpu_ras_log_on_err_counter(struct
> > amdgpu_device *adev)  static void 
> > amdgpu_ras_error_status_query(struct
> > amdgpu_device *adev,
> >  					  struct ras_query_if *info)
> >  {
> > +	struct amdgpu_ras_block_object* block_obj = 
> > +amdgpu_ras_get_ras_block(adev, info->head.block,
> > +info->head.sub_block_index);
> >  	/*
> >  	 * Only two block need to query read/write
> >  	 * RspStatus at current state
> >  	 */
> >  	switch (info->head.block) {
> >  	case AMDGPU_RAS_BLOCK__GFX:
> > -		if (adev->gfx.ras_funcs &&
> > -		    adev->gfx.ras_funcs->query_ras_error_status)
> > -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> > +		if (!block_obj || !block_obj->ops)	{
> > +			dev_info(adev->dev, "%s don't config ras function \n",
> > get_ras_block_str(&info->head));
> > +			return ;
> > +		}
> > +
> > +		if (block_obj->ops->query_ras_error_status)
> > +			block_obj->ops->query_ras_error_status(adev);
> >  		break;
> >  	case AMDGPU_RAS_BLOCK__MMHUB:
> >  		if (adev->mmhub.ras_funcs &&
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 08e91e7245df..2ffde223c4f5 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -817,7 +817,7 @@ static int gfx_v9_0_get_cu_info(struct 
> > amdgpu_device *adev,  static uint64_t 
> > gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);  static 
> > void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring); static 
> > u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); 
> > -static int gfx_v9_0_query_ras_error_count(struct amdgpu_device 
> > *adev,
> > +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device 
> > +*adev,
> >  					  void *ras_error_status);
> >  static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
> >  				     void *inject_if);
> > @@ -2118,6 +2118,18 @@ static void gfx_v9_0_select_me_pipe_q(struct 
> > amdgpu_device *adev,
> >  	soc15_grbm_select(adev, me, pipe, q, vm);  }
> >
> > +static int gfx_v9_0_ras_block_match(struct amdgpu_ras_block_object* 
> > +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> > +	if(!block_obj)
> > +		return -EINVAL;
> > +
> > +	if(block_obj->block == block) {
> > +		return 0;
> > +	}
> > +
> > +	return -EINVAL;
> >[Tao] The return type can be changed to bool and return value is true 
> >or false
> instead of -EINVAL and 0.
>        [Thomas] I think an int return type may offer more 
> scalability for a unified ops interface.

>[Tao] You can use int for the convenience of scalability in the future. But -EINVAL means an error, whereas here it only means that no block matched; would 0 == "no block" and 1 == "found block" be better?

      [Thomas] In the Linux kernel, a function usually returns 0 for success and a value less than 0 for failure. We had better follow this rule.

> 
> > +}
> >[Tao] It's better to implement a general ras block match function in 
> >amdgpu_ras.c
>        [Thomas] The match method of the mca block is different from other blocks.
> Other blocks only use the block to match, but the mca block should use the block 
> and sub block index to match.
> 	          But I can add a default match function in amdgpu_ras.c; 
> if an ip block doesn't define a .ras_block_match function, it will use the 
> default match function in amdgpu_ras.c.
> > +
> >  static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
> >          .get_gpu_clock_counter = &gfx_v9_0_get_gpu_clock_counter,
> >          .select_se_sh = &gfx_v9_0_select_se_sh, @@ -2127,12 
> > +2139,21 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
> >          .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,  };
> >
> > -static const struct amdgpu_gfx_ras_funcs gfx_v9_0_ras_funcs = {
> > -	.ras_late_init = amdgpu_gfx_ras_late_init,
> > -	.ras_fini = amdgpu_gfx_ras_fini,
> > -	.ras_error_inject = &gfx_v9_0_ras_error_inject,
> > -	.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> > -	.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
> > +const struct amdgpu_ras_block_ops  gfx_v9_0_ras_ops = {
> 
> >[Tao]  static const?
>     [Thomas] OK.
> > +		.ras_block_match = gfx_v9_0_ras_block_match,
> > +		.ras_late_init = amdgpu_gfx_ras_late_init,
> > +		.ras_fini = amdgpu_gfx_ras_fini,
> > +		.ras_error_inject = &gfx_v9_0_ras_error_inject,
> > +		.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> > +		.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count, };
> > +
> > +static struct amdgpu_gfx_ras gfx_v9_0_ras = {
> > +	.ras_block = {
> > +		.name = "gfx",
> > +		.block = AMDGPU_RAS_BLOCK__GFX,
> > +		.ops = &gfx_v9_0_ras_ops,
> > +	},
> >  };
> >
> >  static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) @@
> > -2161,7
> > +2182,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device
> > +*adev)
> >  		DRM_INFO("fix gfx.config for vega12\n");
> >  		break;
> >  	case CHIP_VEGA20:
> > -		adev->gfx.ras_funcs = &gfx_v9_0_ras_funcs;
> > +		adev->gfx.ras = &gfx_v9_0_ras;
> >  		adev->gfx.config.max_hw_contexts = 8;
> >  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
> >  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> > 2187,7 +2208,7 @@ static int gfx_v9_0_gpu_early_init(struct 
> > amdgpu_device
> > *adev)
> >  			gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
> >  		break;
> >  	case CHIP_ARCTURUS:
> > -		adev->gfx.ras_funcs = &gfx_v9_4_ras_funcs;
> > +		adev->gfx.ras = &gfx_v9_4_ras;
> >  		adev->gfx.config.max_hw_contexts = 8;
> >  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
> >  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> > 2208,7 +2229,7 @@ static int gfx_v9_0_gpu_early_init(struct 
> > amdgpu_device
> > *adev)
> >  		gb_addr_config |= 0x22010042;
> >  		break;
> >  	case CHIP_ALDEBARAN:
> > -		adev->gfx.ras_funcs = &gfx_v9_4_2_ras_funcs;
> > +		adev->gfx.ras = &gfx_v9_4_2_ras;
> >  		adev->gfx.config.max_hw_contexts = 8;
> >  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
> >  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> > 2227,6 +2248,14 @@ static int gfx_v9_0_gpu_early_init(struct 
> > amdgpu_device
> > *adev)
> >  		break;
> >  	}
> >
> > +	if (adev->gfx.ras) {
> > +		err = amdgpu_ras_register_ras_block(adev, &adev->gfx.ras-
> > >ras_block);
> > +		if (err) {
> > +			DRM_ERROR("Failed to register gfx ras block!\n");
> > +			return err;
> > +		}
> > +	}
> > +
> >  	adev->gfx.config.gb_addr_config = gb_addr_config;
> >
> >  	adev->gfx.config.gb_addr_config_fields.num_pipes = 1 << @@ -2448,9
> > +2477,9 @@ static int gfx_v9_0_sw_fini(void *handle)
> >  	int i;
> >  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> >
> > -	if (adev->gfx.ras_funcs &&
> > -	    adev->gfx.ras_funcs->ras_fini)
> > -		adev->gfx.ras_funcs->ras_fini(adev);
> > +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> > +	    adev->gfx.ras->ras_block.ops->ras_fini)
> > +		adev->gfx.ras->ras_block.ops->ras_fini(adev);
> >
> >  	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
> >  		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
> > @@ -4888,16 +4917,16 @@ static int gfx_v9_0_ecc_late_init(void *handle)
> >  	if (r)
> >  		return r;
> >
> > -	if (adev->gfx.ras_funcs &&
> > -	    adev->gfx.ras_funcs->ras_late_init) {
> > -		r = adev->gfx.ras_funcs->ras_late_init(adev);
> > +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> > +	    adev->gfx.ras->ras_block.ops->ras_late_init) {
> > +		r = adev->gfx.ras->ras_block.ops->ras_late_init(adev);
> >  		if (r)
> >  			return r;
> >  	}
> >
> > -	if (adev->gfx.ras_funcs &&
> > -	    adev->gfx.ras_funcs->enable_watchdog_timer)
> > -		adev->gfx.ras_funcs->enable_watchdog_timer(adev);
> > +	if (adev->gfx.ras &&
> > +	    adev->gfx.ras->enable_watchdog_timer)
> > +		adev->gfx.ras->enable_watchdog_timer(adev);
> >
> >  	return 0;
> >  }
> > @@ -6841,7 +6870,7 @@ static void
> > gfx_v9_0_reset_ras_error_count(struct
> > amdgpu_device *adev)
> >  	WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);  }
> >
> > -static int gfx_v9_0_query_ras_error_count(struct amdgpu_device 
> > *adev,
> > +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device 
> > +*adev,
> >  					  void *ras_error_status)
> >  {
> >  	struct ras_err_data *err_data = (struct ras_err_data 
> > *)ras_error_status; @@ -6850,7 +6879,7 @@ static int 
> > gfx_v9_0_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  	uint32_t reg_value;
> >
> >  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> > -		return -EINVAL;
> > +		return;
> >
> >  	err_data->ue_count = 0;
> >  	err_data->ce_count = 0;
> > @@ -6879,8 +6908,6 @@ static int 
> > gfx_v9_0_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  	mutex_unlock(&adev->grbm_idx_mutex);
> >
> >  	gfx_v9_0_query_utc_edc_status(adev, err_data);
> > -
> > -	return 0;
> >  }
> >
> >  static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring) diff 
> > --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> > index b4789dfc2bb9..2d816addbd4d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> > @@ -863,7 +863,7 @@ static int gfx_v9_4_ras_error_count(struct 
> > amdgpu_device *adev,
> >  	return 0;
> >  }
> >
> > -static int gfx_v9_4_query_ras_error_count(struct amdgpu_device 
> > *adev,
> > +static void gfx_v9_4_query_ras_error_count(struct amdgpu_device 
> > +*adev,
> >  					  void *ras_error_status)
> >  {
> >  	struct ras_err_data *err_data = (struct ras_err_data 
> > *)ras_error_status; @@ -872,7 +872,7 @@ static int 
> > gfx_v9_4_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  	uint32_t reg_value;
> >
> >  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> > -		return -EINVAL;
> > +		return;
> >
> >  	err_data->ue_count = 0;
> >  	err_data->ce_count = 0;
> > @@ -903,7 +903,6 @@ static int gfx_v9_4_query_ras_error_count(struct
> > amdgpu_device *adev,
> >
> >  	gfx_v9_4_query_utc_edc_status(adev, err_data);
> >
> > -	return 0;
> >  }
> >
> >  static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device
> > *adev) @@
> > -1029,11 +1028,31 @@ static void
> > gfx_v9_4_query_ras_error_status(struct
> > amdgpu_device *adev)
> >  	mutex_unlock(&adev->grbm_idx_mutex);
> >  }
> >
> > -const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs = {
> > -        .ras_late_init = amdgpu_gfx_ras_late_init,
> > -        .ras_fini = amdgpu_gfx_ras_fini,
> > -        .ras_error_inject = &gfx_v9_4_ras_error_inject,
> > -        .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> > -        .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> > -        .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
> > +static int gfx_v9_4_ras_block_match(struct amdgpu_ras_block_object* 
> > +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> > +	if(!block_obj)
> > +		return -EINVAL;
> > +
> > +	if(block_obj->block == block) {
> > +		return 0;
> > +	}
> > +	return -EINVAL;
> > +}
> > +
> > +const struct amdgpu_ras_block_ops  gfx_v9_4_ras_ops = {
> > +	.ras_block_match = gfx_v9_4_ras_block_match,
> > +	.ras_late_init = amdgpu_gfx_ras_late_init,
> > +	.ras_fini = amdgpu_gfx_ras_fini,
> > +	.ras_error_inject = &gfx_v9_4_ras_error_inject,
> > +	.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> > +	.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> > +	.query_ras_error_status = &gfx_v9_4_query_ras_error_status, };
> > +
> > +struct amdgpu_gfx_ras gfx_v9_4_ras = {
> > +	.ras_block = {
> > +		.name = "gfx",
> > +		.block = AMDGPU_RAS_BLOCK__GFX,
> > +		.ops = &gfx_v9_4_ras_ops,
> > +	},
> >  };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> > index bdd16b568021..ca520a767267 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> > @@ -24,6 +24,6 @@
> >  #ifndef __GFX_V9_4_H__
> >  #define __GFX_V9_4_H__
> >
> > -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs;
> > +extern struct amdgpu_gfx_ras gfx_v9_4_ras;
> >
> >  #endif /* __GFX_V9_4_H__ */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> > index 54306fd45ff1..2744709fa09d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> > @@ -1644,14 +1644,14 @@ static int
> > gfx_v9_4_2_query_utc_edc_count(struct
> > amdgpu_device *adev,
> >  	return 0;
> >  }
> >
> > -static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device 
> > *adev,
> > +static void gfx_v9_4_2_query_ras_error_count(struct amdgpu_device 
> > +*adev,
> >  					    void *ras_error_status)
> >  {
> >  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> >  	uint32_t sec_count = 0, ded_count = 0;
> >
> >  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> > -		return -EINVAL;
> > +		return;
> >
> >  	err_data->ue_count = 0;
> >  	err_data->ce_count = 0;
> > @@ -1664,7 +1664,6 @@ static int
> > gfx_v9_4_2_query_ras_error_count(struct
> > amdgpu_device *adev,
> >  	err_data->ce_count += sec_count;
> >  	err_data->ue_count += ded_count;
> >
> > -	return 0;
> >  }
> >
> >  static void gfx_v9_4_2_reset_utc_err_status(struct amdgpu_device
> > *adev) @@
> > -1934,13 +1933,34 @@ static void
> > gfx_v9_4_2_reset_sq_timeout_status(struct
> > amdgpu_device *adev)
> >  	mutex_unlock(&adev->grbm_idx_mutex);
> >  }
> >
> > -const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs = {
> > -	.ras_late_init = amdgpu_gfx_ras_late_init,
> > -	.ras_fini = amdgpu_gfx_ras_fini,
> > -	.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> > -	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> > -	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> > -	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
> > -	.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> > +static int gfx_v9_4_2_ras_block_match(struct 
> > +amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> > +	if(!block_obj)
> > +		return -EINVAL;
> > +
> > +	if(block_obj->block == block) {
> > +		return 0;
> > +	}
> > +
> > +	return -EINVAL;
> > +}
> > +
> > +struct amdgpu_ras_block_ops  gfx_v9_4_2_ras_ops ={
> > +		.ras_block_match = gfx_v9_4_2_ras_block_match,
> > +		.ras_late_init = amdgpu_gfx_ras_late_init,
> > +		.ras_fini = amdgpu_gfx_ras_fini,
> > +		.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> > +		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> > +		.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> > +		.query_ras_error_status =
> > &gfx_v9_4_2_query_ras_error_status,
> > +		.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> > +};
> > +
> > +struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
> > +	.ras_block = {
> > +		.name = "gfx",
> > +		.block = AMDGPU_RAS_BLOCK__GFX,
> > +		.ops = &gfx_v9_4_2_ras_ops,
> > +	},
> >  	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
> >  };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> > index 6db1f88509af..7584624b641c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> > @@ -31,6 +31,6 @@ void gfx_v9_4_2_init_golden_registers(struct
> > amdgpu_device *adev,  void 
> > gfx_v9_4_2_set_power_brake_sequence(struct
> > amdgpu_device *adev);  int gfx_v9_4_2_do_edc_gpr_workarounds(struct
> > amdgpu_device *adev);
> >
> > -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs;
> > +extern struct amdgpu_gfx_ras gfx_v9_4_2_ras;
> >
> >  #endif /* __GFX_V9_4_2_H__ */
> > --
> > 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block
  2021-12-06  7:33   ` Zhou1, Tao
@ 2021-12-07  7:18     ` Chai, Thomas
  0 siblings, 0 replies; 20+ messages in thread
From: Chai, Thomas @ 2021-12-07  7:18 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx; +Cc: Zhang, Hawking

I can add a default error injection function in amdgpu_ras.c; if a block doesn't define its own .ras_error_inject function, it will use the default error injection function in amdgpu_ras.c.

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Monday, December 6, 2021 3:34 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block

[AMD Official Use Only]

The error injection has no difference among RAS blocks except GFX and XGMI.
I agree to move the xgmi error injection to amdgpu_xgmi.c, but I don't think it's necessary to implement specific error injection functions for all other RAS blocks.

Regards,
Tao

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking 
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Chai, Thomas 
> <YiPeng.Chai@amd.com>
> Subject: [PATCH V2 11/11] drm/amdgpu: Move error inject function from 
> amdgpu_ras.c to each block
> 
> Move each block error inject function from amdgpu_ras.c to each block.
> 
> Signed-off-by: yipechai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 62 
> +++++------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++++++++++
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.c    | 18 +++++++
>  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c  | 16 ++++++ 
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c  | 16 ++++++ 
> drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c  | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_1.c    | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c    | 16 ++++++
>  drivers/gpu/drm/amd/amdgpu/umc_v8_7.c    | 16 ++++++
>  12 files changed, 201 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2e38bd3d3d45..87b625d305c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> -/* Trigger XGMI/WAFL error */
> -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
> -				 struct ta_ras_trigger_error_input *block_info)
> -{
> -	int ret;
> -
> -	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> -		dev_warn(adev->dev, "Failed to disallow df cstate");
> 
> -	if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
> -		dev_warn(adev->dev, "Failed to disallow XGMI power down");
> -
> -	ret = psp_ras_trigger_error(&adev->psp, block_info);
> -
> -	if (amdgpu_ras_intr_triggered())
> -		return ret;
> -
> -	if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
> -		dev_warn(adev->dev, "Failed to allow XGMI power down");
> -
> -	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
> -		dev_warn(adev->dev, "Failed to allow df cstate");
> -
> -	return ret;
> -}
> 
>  /* wrapper of psp_ras_trigger_error */  int 
> amdgpu_ras_error_inject(struct amdgpu_device *adev, @@ -1076,41
> +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>  	if (!obj)
>  		return -EINVAL;
> 
> +	if (!block_obj || !block_obj->ops)	{
> +		dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> +		return -EINVAL;
> +	}
> +
>  	/* Calculate XGMI relative offset */
>  	if (adev->gmc.xgmi.num_physical_nodes > 1) {
> -		block_info.address =
> -			amdgpu_xgmi_get_relative_phy_addr(adev,
> -							  block_info.address);
> +		block_info.address =
> amdgpu_xgmi_get_relative_phy_addr(adev,
> +block_info.address);
>  	}
> 
> -	switch (info->head.block) {
> -	case AMDGPU_RAS_BLOCK__GFX:
> -		if (!block_obj || !block_obj->ops)	{
> -			dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> -			return -EINVAL;
> -		}
> -		if (block_obj->ops->ras_error_inject)
> +	if (block_obj->ops->ras_error_inject) {
> +		if(info->head.block == AMDGPU_RAS_BLOCK__GFX)
>  			ret = block_obj->ops->ras_error_inject(adev, info);
> -		break;
> -	case AMDGPU_RAS_BLOCK__UMC:
> -	case AMDGPU_RAS_BLOCK__SDMA:
> -	case AMDGPU_RAS_BLOCK__MMHUB:
> -	case AMDGPU_RAS_BLOCK__PCIE_BIF:
> -	case AMDGPU_RAS_BLOCK__MCA:
> -		ret = psp_ras_trigger_error(&adev->psp, &block_info);
> -		break;
> -	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
> -		ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
> -		break;
> -	default:
> -		dev_info(adev->dev, "%s error injection is not supported yet\n",
> -			 get_ras_block_str(&info->head));
> -		ret = -EINVAL;
> +		else
> +			ret = block_obj->ops->ras_error_inject(adev,
> &block_info);
>  	}
> 
>  	if (ret)
> -		dev_err(adev->dev, "ras inject %s failed %d\n",
> -			get_ras_block_str(&info->head), ret);
> +		dev_err(adev->dev, "ras inject %s failed %d\n", 
> +get_ras_block_str(&info->head), ret);
> 
>  	return ret;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index da541c7b1ec2..298742afba99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -940,6 +940,33 @@ static void
> amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
>  	err_data->ce_count += ce_cnt;
>  }
> 
> +/* Trigger XGMI/WAFL error */
> +static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
> +                                void *inject_if) {
> +       int ret = 0;;
> +       struct ta_ras_trigger_error_input *block_info =  (struct 
> +ta_ras_trigger_error_input *)inject_if;
> +
> +       if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> +               dev_warn(adev->dev, "Failed to disallow df cstate");
> +
> +       if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
> +               dev_warn(adev->dev, "Failed to disallow XGMI power 
> + down");
> +
> +       ret = psp_ras_trigger_error(&adev->psp, block_info);
> +
> +       if (amdgpu_ras_intr_triggered())
> +               return ret;
> +
> +       if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
> +               dev_warn(adev->dev, "Failed to allow XGMI power 
> + down");
> +
> +       if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
> +               dev_warn(adev->dev, "Failed to allow df cstate");
> +
> +       return ret;
> +}
> +
>  static int amdgpu_xgmi_ras_block_match(struct 
> amdgpu_ras_block_object* block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index)  {
>  	if(!block_obj)
> @@ -958,6 +985,7 @@ struct amdgpu_ras_block_ops  xgmi_ras_ops = {
>  	.ras_fini = amdgpu_xgmi_ras_fini,
>  	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
>  	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
> +	.ras_error_inject = amdgpu_ras_error_inject_xgmi,
>  };
> 
>  struct amdgpu_xgmi_ras xgmi_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> index 99edc75ed4ec..ce6841967b05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> @@ -60,12 +60,28 @@ static int mca_v3_0_ras_block_match(struct
> amdgpu_ras_block_object* block_obj, e
>  	return -EINVAL;
>  }
> 
> +static int mca_v3_0_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops mca_v3_0_mp0_ops = {
>  	.ras_block_match = mca_v3_0_ras_block_match,
>  	.ras_late_init = mca_v3_0_mp0_ras_late_init,
>  	.ras_fini = mca_v3_0_mp0_ras_fini,
>  	.query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
>  	.query_ras_error_address = NULL,
> +	.ras_error_inject = mca_v3_0_ras_error_inject,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = { @@ -101,6 +117,7 @@ 
> const struct amdgpu_ras_block_ops mca_v3_0_mp1_ops = {
>  	.ras_fini = mca_v3_0_mp1_ras_fini,
>  	.query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
>  	.query_ras_error_address = NULL,
> +	.ras_error_inject = mca_v3_0_ras_error_inject,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = { @@ -136,6 +153,7 @@ 
> const struct amdgpu_ras_block_ops mca_v3_0_mpio_ops = {
>  	.ras_fini = mca_v3_0_mpio_ras_fini,
>  	.query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
>  	.query_ras_error_address = NULL,
> +	.ras_error_inject = mca_v3_0_ras_error_inject,
>  };
> 
>  struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = { diff --git 
> a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> index da505314802a..7cca86c504e6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> @@ -786,12 +786,28 @@ static int mmhub_v1_0_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int mmhub_v1_0_ras_error_inject(struct amdgpu_device *adev, 
> +void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  struct amdgpu_ras_block_ops mmhub_v1_0_ras_ops = {
>  	.ras_block_match = mmhub_v1_0_ras_block_match,
>  	.ras_late_init = amdgpu_mmhub_ras_late_init,
>  	.ras_fini = amdgpu_mmhub_ras_fini,
>  	.query_ras_error_count = mmhub_v1_0_query_ras_error_count,
>  	.reset_ras_error_count = mmhub_v1_0_reset_ras_error_count,
> +	.ras_error_inject = mmhub_v1_0_ras_error_inject,
>  };
> 
>  struct amdgpu_mmhub_ras mmhub_v1_0_ras = { diff --git 
> a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> index 829d14ee87d3..79a9995caef1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> @@ -1333,6 +1333,21 @@ static int mmhub_v1_7_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int mmhub_v1_7_ras_error_inject(struct amdgpu_device *adev, 
> +void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  struct amdgpu_ras_block_ops mmhub_v1_7_ras_ops = {
>  	.ras_block_match = mmhub_v1_7_ras_block_match,
>  	.ras_late_init = amdgpu_mmhub_ras_late_init, @@ -1341,6 +1356,7 @@ 
> struct amdgpu_ras_block_ops mmhub_v1_7_ras_ops = {
>  	.reset_ras_error_count = mmhub_v1_7_reset_ras_error_count,
>  	.query_ras_error_status = mmhub_v1_7_query_ras_error_status,
>  	.reset_ras_error_status = mmhub_v1_7_reset_ras_error_status,
> +	.ras_error_inject = mmhub_v1_7_ras_error_inject,
>  };
> 
>  struct amdgpu_mmhub_ras mmhub_v1_7_ras = { diff --git 
> a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
> index 1edc98e5bcbb..eaed556b9551 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
> @@ -1667,6 +1667,21 @@ static int mmhub_v9_4_ras_block_match(struct
> amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int mmhub_v9_4_ras_error_inject(struct amdgpu_device *adev, 
> +void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct
> ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops mmhub_v9_4_ras_ops = {
>  	.ras_block_match = mmhub_v9_4_ras_block_match,
>  	.ras_late_init = amdgpu_mmhub_ras_late_init, @@ -1674,6 +1689,7 @@ 
> const struct amdgpu_ras_block_ops mmhub_v9_4_ras_ops = {
>  	.query_ras_error_count = mmhub_v9_4_query_ras_error_count,
>  	.reset_ras_error_count = mmhub_v9_4_reset_ras_error_count,
>  	.query_ras_error_status = mmhub_v9_4_query_ras_error_status,
> +	.ras_error_inject = mmhub_v9_4_ras_error_inject,
>  };
> 
>  struct amdgpu_mmhub_ras mmhub_v9_4_ras = { diff --git 
> a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index 14f7265d954e..8e62e2ffabe5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -650,11 +650,27 @@ static int nbio_v7_4_ras_block_match(struct amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int nbio_v7_4_ras_error_inject(struct amdgpu_device *adev, 
> +void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops nbio_v7_4_ras_ops = {
>  	.ras_block_match = nbio_v7_4_ras_block_match,
>  	.query_ras_error_count = nbio_v7_4_query_ras_error_count,
>  	.ras_late_init = amdgpu_nbio_ras_late_init,
>  	.ras_fini = amdgpu_nbio_ras_fini,
> +	.ras_error_inject = nbio_v7_4_ras_error_inject,
>  };
> 
>  struct amdgpu_nbio_ras nbio_v7_4_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 30a651613776..578ee40cc0d1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -2803,11 +2803,27 @@ static int sdma_v4_0_ras_block_match(struct amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int sdma_v4_0_ras_error_inject(struct amdgpu_device *adev, 
> +void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops sdma_v4_0_ras_ops = {
>  	.ras_block_match = sdma_v4_0_ras_block_match,
>  	.ras_fini = amdgpu_sdma_ras_fini,
>  	.query_ras_error_count = sdma_v4_0_query_ras_error_count,
>  	.reset_ras_error_count = sdma_v4_0_reset_ras_error_count,
> +	.ras_error_inject = sdma_v4_0_ras_error_inject,
>  };
> 
>  static struct amdgpu_sdma_ras sdma_v4_0_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
> index 8c165bcb0ffa..0656c6a7a2c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c
> @@ -270,11 +270,27 @@ static int sdma_v4_4_ras_block_match(struct amdgpu_ras_block_object* block_obj,
>  	return -EINVAL;
>  }
> 
> +static int sdma_v4_4_ras_error_inject(struct amdgpu_device *adev, 
> +void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops sdma_v4_4_ras_ops = {
>  	.ras_block_match = sdma_v4_4_ras_block_match,
>  	.ras_fini = amdgpu_sdma_ras_fini,
>  	.query_ras_error_count = sdma_v4_4_query_ras_error_count,
>  	.reset_ras_error_count = sdma_v4_4_reset_ras_error_count,
> +	.ras_error_inject = sdma_v4_4_ras_error_inject,
>  };
> 
>  struct amdgpu_sdma_ras sdma_v4_4_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
> index ed480c2081a6..2058439b02cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
> @@ -477,12 +477,28 @@ static int umc_v6_1_ras_block_match(struct amdgpu_ras_block_object* block_obj, e
>  	return -EINVAL;
>  }
> 
> +static int umc_v6_1_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops umc_v6_1_ras_ops = {
>  	.ras_block_match = umc_v6_1_ras_block_match,
>  	.ras_late_init = amdgpu_umc_ras_late_init,
>  	.ras_fini = amdgpu_umc_ras_fini,
>  	.query_ras_error_count = umc_v6_1_query_ras_error_count,
>  	.query_ras_error_address = umc_v6_1_query_ras_error_address,
> +	.ras_error_inject = umc_v6_1_ras_error_inject,
>  };
> 
>  struct amdgpu_umc_ras umc_v6_1_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index e26728dbc6e9..2e87e7de4a55 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -333,6 +333,21 @@ static int umc_v6_7_ras_block_match(struct amdgpu_ras_block_object* block_obj, e
>  	return -EINVAL;
>  }
> 
> +static int umc_v6_7_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops umc_v6_7_ras_pos = {
>  	.ras_block_match = umc_v6_7_ras_block_match,
>  	.ras_late_init = amdgpu_umc_ras_late_init,
> @@ -340,6 +355,7 @@ const struct amdgpu_ras_block_ops umc_v6_7_ras_pos = {
>  	.query_ras_error_count = umc_v6_7_query_ras_error_count,
>  	.query_ras_error_address = umc_v6_7_query_ras_error_address,
>  	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
> +	.ras_error_inject = umc_v6_7_ras_error_inject,
>  };
> 
>  struct amdgpu_umc_ras umc_v6_7_ras = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
> index 037791e90c24..f7fb653434b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_7.c
> @@ -336,12 +336,28 @@ static int umc_v8_7_ras_block_match(struct amdgpu_ras_block_object* block_obj, e
>  	return -EINVAL;
>  }
> 
> +static int umc_v8_7_ras_error_inject(struct amdgpu_device *adev, void
> +*inject_if) {
> +	int ret = 0;
> +	if (!adev || !inject_if) {
> +		dev_err(adev->dev, "%s invaild parameters \n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +	ret = psp_ras_trigger_error(&adev->psp, (struct ta_ras_trigger_error_input *)inject_if);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return ret;
> +}
> +
>  const struct amdgpu_ras_block_ops umc_v8_7_ras_ops = {
>  	.ras_block_match = umc_v8_7_ras_block_match,
>  	.ras_late_init = amdgpu_umc_ras_late_init,
>  	.ras_fini = amdgpu_umc_ras_fini,
>  	.query_ras_error_count = umc_v8_7_query_ras_error_count,
>  	.query_ras_error_address = umc_v8_7_query_ras_error_address,
> +	.ras_error_inject = umc_v8_7_ras_error_inject,
>  };
> 
>  struct amdgpu_umc_ras umc_v8_7_ras = {
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2021-12-07  7:18 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-01 10:52 [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block yipechai
2021-12-01 10:52 ` [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h yipechai
2021-12-06  6:56   ` Zhou1, Tao
2021-12-07  2:31     ` Chai, Thomas
2021-12-01 10:52 ` [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops yipechai
2021-12-06  6:58   ` Zhou1, Tao
2021-12-07  3:37     ` Chai, Thomas
2021-12-07  4:06       ` Zhou1, Tao
2021-12-07  6:31         ` Chai, Thomas
2021-12-01 10:52 ` [PATCH V2 04/11] drm/amdgpu: Modify gmc " yipechai
2021-12-01 10:52 ` [PATCH V2 05/11] drm/amdgpu: Modify hdp " yipechai
2021-12-01 10:52 ` [PATCH V2 06/11] drm/amdgpu: Modify mmhub " yipechai
2021-12-01 10:52 ` [PATCH V2 07/11] drm/amdgpu: Modify nbio " yipechai
2021-12-01 10:52 ` [PATCH V2 08/11] drm/amdgpu: Modify umc " yipechai
2021-12-01 10:52 ` [PATCH V2 09/11] drm/amdgpu: Modify sdma " yipechai
2021-12-01 10:52 ` [PATCH V2 10/11] drm/amdgpu: Modify mca " yipechai
2021-12-01 10:52 ` [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block yipechai
2021-12-06  7:33   ` Zhou1, Tao
2021-12-07  7:18     ` Chai, Thomas
2021-12-06  7:36 ` [PATCH V2 01/11] drm/amdgpu: Unify ras block interface for each ras block Zhou1, Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.