All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init
@ 2019-08-28 13:03 Hawking Zhang
       [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Hawking Zhang @ 2019-08-28 13:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Tao Zhou, Alex Deucher
  Cc: Hawking Zhang

In the RAS late_init phase, the helper function will be used to:
1) disable the RAS feature if the IP block is masked as disabled
2) send the enable-feature command if the IP block is masked as enabled
3) create the debugfs/sysfs nodes per IP block
4) register the interrupt handler

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 +++
 2 files changed, 61 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 230f7e6..2c32f99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1564,6 +1564,63 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	return -EINVAL;
 }
 
+/* helper function to handle common stuff in ip late init phase: enable or
+ * disable ras for the block and, unless resuming, add the irq handler
+ * (UMC/SDMA/GFX only) and fs nodes. returns 0 or the error that stopped it */
+int amdgpu_ras_late_init(struct amdgpu_device *adev,
+			 struct ras_common_if *ras_block,
+			 struct ras_fs_if *fs_info,
+			 struct ras_ih_if *ih_info)
+{
+	int r;
+
+	/* disable RAS feature per IP block if it is not supported */
+	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
+		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
+		return 0;
+	}
+
+	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
+	if (r == -EAGAIN) {
+		/* request gpu reset. will run again */
+		amdgpu_ras_request_reset_on_boot(adev, ras_block->block);
+		return 0;
+	}
+	if (r) {
+		/* resume: remove the fs nodes and disable ras; boot: just fail */
+		if (adev->in_suspend)
+			goto cleanup;
+		return r;
+	}
+
+	/* in resume phase, no need to create ras fs node */
+	if (adev->in_suspend)
+		return 0;
+
+	if (ras_block->block == AMDGPU_RAS_BLOCK__UMC ||
+	    ras_block->block == AMDGPU_RAS_BLOCK__SDMA ||
+	    ras_block->block == AMDGPU_RAS_BLOCK__GFX) {
+		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
+		if (r)
+			goto interrupt;
+	}
+
+	amdgpu_ras_debugfs_create(adev, fs_info);
+	r = amdgpu_ras_sysfs_create(adev, fs_info);
+	if (r)
+		goto sysfs;
+
+	return 0;
+cleanup:
+	amdgpu_ras_sysfs_remove(adev, ras_block);
+sysfs:
+	amdgpu_ras_debugfs_remove(adev, ras_block);
+	amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+interrupt:
+	amdgpu_ras_feature_enable(adev, ras_block, 0);
+	return r;
+}
+
 /* do some init work after IP late init as dependence.
  * and it runs in resume/gpu reset/booting up cases.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6c76bb2..5212961 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -566,6 +566,10 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
 int amdgpu_ras_init(struct amdgpu_device *adev);
 int amdgpu_ras_fini(struct amdgpu_device *adev);
 int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
+int amdgpu_ras_late_init(struct amdgpu_device *adev,
+			 struct ras_common_if *ras_block,
+			 struct ras_fs_if *fs_info,
+			 struct ras_ih_if *ih_info);
 
 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 		struct ras_common_if *head, bool enable);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/7] drm/amdgpu: switch to amdgpu_ras_late_init for gfx v9 block
       [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-28 13:03   ` Hawking Zhang
       [not found]     ` <1566997395-7185-2-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
  2019-08-28 13:03   ` [PATCH 3/7] drm/amdgpu: switch to ras_late_init for sdma v4 block Hawking Zhang
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 18+ messages in thread
From: Hawking Zhang @ 2019-08-28 13:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Tao Zhou, Alex Deucher
  Cc: Hawking Zhang

call helper function in late init phase to handle ras init
for gfx ip block

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 90 ++++++++---------------------------
 1 file changed, 19 insertions(+), 71 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index a6bcbde..62ec451 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4389,7 +4389,6 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 static int gfx_v9_0_ecc_late_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-	struct ras_common_if **ras_if = &adev->gfx.ras_if;
 	struct ras_ih_if ih_info = {
 		.cb = gfx_v9_0_process_ras_data_cb,
 	};
@@ -4397,18 +4396,18 @@ static int gfx_v9_0_ecc_late_init(void *handle)
 		.sysfs_name = "gfx_err_count",
 		.debugfs_name = "gfx_err_inject",
 	};
-	struct ras_common_if ras_block = {
-		.block = AMDGPU_RAS_BLOCK__GFX,
-		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-		.sub_block_index = 0,
-		.name = "gfx",
-	};
 	int r;
 
-	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
-		amdgpu_ras_feature_enable_on_boot(adev, &ras_block, 0);
-		return 0;
+	if (!adev->gfx.ras_if) {
+		adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+		if (!adev->gfx.ras_if)
+			return -ENOMEM;
+		adev->gfx.ras_if->block = AMDGPU_RAS_BLOCK__GFX;
+		adev->gfx.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+		adev->gfx.ras_if->sub_block_index = 0;
+		strcpy(adev->gfx.ras_if->name, "gfx");
 	}
+	fs_info.head = ih_info.head = *adev->gfx.ras_if;
 
 	r = gfx_v9_0_do_edc_gds_workarounds(adev);
 	if (r)
@@ -4419,71 +4418,20 @@ static int gfx_v9_0_ecc_late_init(void *handle)
 	if (r)
 		return r;
 
-	/* handle resume path. */
-	if (*ras_if) {
-		/* resend ras TA enable cmd during resume.
-		 * prepare to handle failure.
-		 */
-		ih_info.head = **ras_if;
-		r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-		if (r) {
-			if (r == -EAGAIN) {
-				/* request a gpu reset. will run again. */
-				amdgpu_ras_request_reset_on_boot(adev,
-						AMDGPU_RAS_BLOCK__GFX);
-				return 0;
-			}
-			/* fail to enable ras, cleanup all. */
-			goto irq;
-		}
-		/* enable successfully. continue. */
-		goto resume;
-	}
-
-	*ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
-	if (!*ras_if)
-		return -ENOMEM;
-
-	**ras_if = ras_block;
-
-	r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-	if (r) {
-		if (r == -EAGAIN) {
-			amdgpu_ras_request_reset_on_boot(adev,
-					AMDGPU_RAS_BLOCK__GFX);
-			r = 0;
-		}
-		goto feature;
-	}
-
-	ih_info.head = **ras_if;
-	fs_info.head = **ras_if;
-
-	r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
+	r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
+				 &fs_info, &ih_info);
 	if (r)
-		goto interrupt;
+		goto free;
 
-	amdgpu_ras_debugfs_create(adev, &fs_info);
-
-	r = amdgpu_ras_sysfs_create(adev, &fs_info);
-	if (r)
-		goto sysfs;
-resume:
-	r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
-	if (r)
-		goto irq;
+	if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
+		r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
+		if (r)
+			goto free;
+	}
 
 	return 0;
-irq:
-	amdgpu_ras_sysfs_remove(adev, *ras_if);
-sysfs:
-	amdgpu_ras_debugfs_remove(adev, *ras_if);
-	amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
-interrupt:
-	amdgpu_ras_feature_enable(adev, *ras_if, 0);
-feature:
-	kfree(*ras_if);
-	*ras_if = NULL;
+free:
+	kfree(adev->gfx.ras_if);
 	return r;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/7] drm/amdgpu: switch to ras_late_init for sdma v4 block
       [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
  2019-08-28 13:03   ` [PATCH 2/7] drm/amdgpu: switch to amdgpu_ras_late_init for gfx v9 block Hawking Zhang
@ 2019-08-28 13:03   ` Hawking Zhang
  2019-08-28 13:03   ` [PATCH 4/7] drm/amdgpu: switch to ras_late_init for gmc v9 Hawking Zhang
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 18+ messages in thread
From: Hawking Zhang @ 2019-08-28 13:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Tao Zhou, Alex Deucher
  Cc: Hawking Zhang

call ras_late_init helper function to do ras init for sdma block

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 96 ++++++++--------------------------
 1 file changed, 22 insertions(+), 74 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 256d381..c3486c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1692,7 +1692,6 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
 static int sdma_v4_0_late_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-	struct ras_common_if **ras_if = &adev->sdma.ras_if;
 	struct ras_ih_if ih_info = {
 		.cb = sdma_v4_0_process_ras_data_cb,
 	};
@@ -1700,87 +1699,36 @@ static int sdma_v4_0_late_init(void *handle)
 		.sysfs_name = "sdma_err_count",
 		.debugfs_name = "sdma_err_inject",
 	};
-	struct ras_common_if ras_block = {
-		.block = AMDGPU_RAS_BLOCK__SDMA,
-		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-		.sub_block_index = 0,
-		.name = "sdma",
-	};
 	int r, i;
 
-	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
-		amdgpu_ras_feature_enable_on_boot(adev, &ras_block, 0);
-		return 0;
-	}
-
-	/* handle resume path. */
-	if (*ras_if) {
-		/* resend ras TA enable cmd during resume.
-		 * prepare to handle failure.
-		 */
-		ih_info.head = **ras_if;
-		r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-		if (r) {
-			if (r == -EAGAIN) {
-				/* request a gpu reset. will run again. */
-				amdgpu_ras_request_reset_on_boot(adev,
-						AMDGPU_RAS_BLOCK__SDMA);
-				return 0;
-			}
-			/* fail to enable ras, cleanup all. */
-			goto irq;
-		}
-		/* enable successfully. continue. */
-		goto resume;
-	}
-
-	*ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
-	if (!*ras_if)
-		return -ENOMEM;
-
-	**ras_if = ras_block;
-
-	r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-	if (r) {
-		if (r == -EAGAIN) {
-			amdgpu_ras_request_reset_on_boot(adev,
-					AMDGPU_RAS_BLOCK__SDMA);
-			r = 0;
-		}
-		goto feature;
+	if (!adev->sdma.ras_if) {
+		adev->sdma.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+		if (!adev->sdma.ras_if)
+			return -ENOMEM;
+		adev->sdma.ras_if->block = AMDGPU_RAS_BLOCK__SDMA;
+		adev->sdma.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+		adev->sdma.ras_if->sub_block_index = 0;
+		strcpy(adev->sdma.ras_if->name, "sdma");
 	}
+	fs_info.head = ih_info.head = *adev->sdma.ras_if;
 
-	ih_info.head = **ras_if;
-	fs_info.head = **ras_if;
-
-	r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
+	r = amdgpu_ras_late_init(adev, adev->sdma.ras_if,
+				 &fs_info, &ih_info);
 	if (r)
-		goto interrupt;
-
-	amdgpu_ras_debugfs_create(adev, &fs_info);
+		goto free;
 
-	r = amdgpu_ras_sysfs_create(adev, &fs_info);
-	if (r)
-		goto sysfs;
-resume:
-	for (i = 0; i < adev->sdma.num_instances; i++) {
-		r = amdgpu_irq_get(adev, &adev->sdma.ecc_irq,
-				   AMDGPU_SDMA_IRQ_INSTANCE0 + i);
-		if (r)
-			goto irq;
+	if (amdgpu_ras_is_supported(adev, adev->sdma.ras_if->block)) {
+		for (i = 0; i < adev->sdma.num_instances; i++) {
+			r = amdgpu_irq_get(adev, &adev->sdma.ecc_irq,
+				AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+			if (r)
+				goto free;
+		}
 	}
 
-	return 0;
-irq:
-	amdgpu_ras_sysfs_remove(adev, *ras_if);
-sysfs:
-	amdgpu_ras_debugfs_remove(adev, *ras_if);
-	amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
-interrupt:
-	amdgpu_ras_feature_enable(adev, *ras_if, 0);
-feature:
-	kfree(*ras_if);
-	*ras_if = NULL;
+        return 0;
+free:
+	kfree(adev->sdma.ras_if);
 	return r;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/7] drm/amdgpu: switch to ras_late_init for gmc v9
       [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
  2019-08-28 13:03   ` [PATCH 2/7] drm/amdgpu: switch to amdgpu_ras_late_init for gfx v9 block Hawking Zhang
  2019-08-28 13:03   ` [PATCH 3/7] drm/amdgpu: switch to ras_late_init for sdma v4 block Hawking Zhang
@ 2019-08-28 13:03   ` Hawking Zhang
  2019-08-28 13:03   ` [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback function Hawking Zhang
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 18+ messages in thread
From: Hawking Zhang @ 2019-08-28 13:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Tao Zhou, Alex Deucher
  Cc: Hawking Zhang

call amdgpu_ras_late_init to do ras init for gmc v9 block

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 155 ++++++++++------------------------
 1 file changed, 43 insertions(+), 112 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index e97c3c8..8dc13d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -758,133 +758,64 @@ static int gmc_v9_0_allocate_vm_inv_eng(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int gmc_v9_0_ecc_ras_block_late_init(void *handle,
-			struct ras_fs_if *fs_info, struct ras_common_if *ras_block)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-	struct ras_common_if **ras_if = NULL;
-	struct ras_ih_if ih_info = {
-		.cb = gmc_v9_0_process_ras_data_cb,
-	};
-	int r;
-
-	if (ras_block->block == AMDGPU_RAS_BLOCK__UMC)
-		ras_if = &adev->gmc.umc_ras_if;
-	else if (ras_block->block == AMDGPU_RAS_BLOCK__MMHUB)
-		ras_if = &adev->gmc.mmhub_ras_if;
-	else
-		BUG();
-
-	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
-		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
-		return 0;
-	}
-
-	/* handle resume path. */
-	if (*ras_if) {
-		/* resend ras TA enable cmd during resume.
-		 * prepare to handle failure.
-		 */
-		ih_info.head = **ras_if;
-		r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-		if (r) {
-			if (r == -EAGAIN) {
-				/* request a gpu reset. will run again. */
-				amdgpu_ras_request_reset_on_boot(adev,
-						ras_block->block);
-				return 0;
-			}
-			/* fail to enable ras, cleanup all. */
-			goto irq;
-		}
-		/* enable successfully. continue. */
-		goto resume;
-	}
-
-	*ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
-	if (!*ras_if)
-		return -ENOMEM;
-
-	**ras_if = *ras_block;
-
-	r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-	if (r) {
-		if (r == -EAGAIN) {
-			amdgpu_ras_request_reset_on_boot(adev,
-					ras_block->block);
-			r = 0;
-		}
-		goto feature;
-	}
-
-	ih_info.head = **ras_if;
-	fs_info->head = **ras_if;
-
-	if (ras_block->block == AMDGPU_RAS_BLOCK__UMC) {
-		r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
-		if (r)
-			goto interrupt;
-	}
-
-	amdgpu_ras_debugfs_create(adev, fs_info);
-
-	r = amdgpu_ras_sysfs_create(adev, fs_info);
-	if (r)
-		goto sysfs;
-resume:
-	if (ras_block->block == AMDGPU_RAS_BLOCK__UMC) {
-		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
-		if (r)
-			goto irq;
-	}
-
-	return 0;
-irq:
-	amdgpu_ras_sysfs_remove(adev, *ras_if);
-sysfs:
-	amdgpu_ras_debugfs_remove(adev, *ras_if);
-	if (ras_block->block == AMDGPU_RAS_BLOCK__UMC)
-		amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
-interrupt:
-	amdgpu_ras_feature_enable(adev, *ras_if, 0);
-feature:
-	kfree(*ras_if);
-	*ras_if = NULL;
-	return r;
-}
-
 static int gmc_v9_0_ecc_late_init(void *handle)
 {
 	int r;
-
+	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	struct ras_ih_if mmhub_ih_info;
 	struct ras_fs_if umc_fs_info = {
 		.sysfs_name = "umc_err_count",
 		.debugfs_name = "umc_err_inject",
 	};
-	struct ras_common_if umc_ras_block = {
-		.block = AMDGPU_RAS_BLOCK__UMC,
-		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-		.sub_block_index = 0,
-		.name = "umc",
+	struct ras_ih_if umc_ih_info = {
+		.cb = gmc_v9_0_process_ras_data_cb,
 	};
 	struct ras_fs_if mmhub_fs_info = {
 		.sysfs_name = "mmhub_err_count",
 		.debugfs_name = "mmhub_err_inject",
 	};
-	struct ras_common_if mmhub_ras_block = {
-		.block = AMDGPU_RAS_BLOCK__MMHUB,
-		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-		.sub_block_index = 0,
-		.name = "mmhub",
-	};
 
-	r = gmc_v9_0_ecc_ras_block_late_init(handle,
-			&umc_fs_info, &umc_ras_block);
+	if (!adev->gmc.umc_ras_if) {
+		adev->gmc.umc_ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+		if (!adev->gmc.umc_ras_if)
+			return -ENOMEM;
+		adev->gmc.umc_ras_if->block = AMDGPU_RAS_BLOCK__UMC;
+		adev->gmc.umc_ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+		adev->gmc.umc_ras_if->sub_block_index = 0;
+		strcpy(adev->gmc.umc_ras_if->name, "umc");
+	}
+	umc_ih_info.head = umc_fs_info.head = *adev->gmc.umc_ras_if;
+
+	r = amdgpu_ras_late_init(adev, adev->gmc.umc_ras_if,
+				 &umc_fs_info, &umc_ih_info);
 	if (r)
-		return r;
+		goto free;
 
-	r = gmc_v9_0_ecc_ras_block_late_init(handle,
-			&mmhub_fs_info, &mmhub_ras_block);
+	if (amdgpu_ras_is_supported(adev, adev->gmc.umc_ras_if->block)) {
+		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
+		if (r)
+			goto free;
+	}
+
+	if (!adev->gmc.mmhub_ras_if) {
+		adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+		if (!adev->gmc.mmhub_ras_if)
+			return -ENOMEM;
+		adev->gmc.mmhub_ras_if->block = AMDGPU_RAS_BLOCK__MMHUB;
+		adev->gmc.mmhub_ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+		adev->gmc.mmhub_ras_if->sub_block_index = 0;
+		strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
+	}
+	mmhub_ih_info.head = mmhub_fs_info.head = *adev->gmc.mmhub_ras_if;
+	r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
+				 &mmhub_fs_info, &mmhub_ih_info);
+	if (r)
+		goto free;
+
+	return 0;
+free:
+	kfree(adev->gmc.umc_ras_if);
+	kfree(adev->gmc.mmhub_ras_if);
 	return r;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback function
       [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2019-08-28 13:03   ` [PATCH 4/7] drm/amdgpu: switch to ras_late_init for gmc v9 Hawking Zhang
@ 2019-08-28 13:03   ` Hawking Zhang
       [not found]     ` <1566997395-7185-5-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
  2019-08-28 13:03   ` [PATCH 6/7] drm/amdgpu: add ras_late_init callback function for nbio v7_4 Hawking Zhang
                     ` (2 subsequent siblings)
  6 siblings, 1 reply; 18+ messages in thread
From: Hawking Zhang @ 2019-08-28 13:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Tao Zhou, Alex Deucher
  Cc: Hawking Zhang

The function will be called in late init phase to do mmhub
ras init

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c     | 21 ++-------------------
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c   | 28 ++++++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 2d75ecf..df04c71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -23,6 +23,7 @@
 
 struct amdgpu_mmhub_funcs {
 	void (*ras_init)(struct amdgpu_device *adev);
+	int (*ras_late_init)(struct amdgpu_device *adev);
 	void (*query_ras_error_count)(struct amdgpu_device *adev,
 					void *ras_error_status);
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 8dc13d2..26a6956 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -762,7 +762,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)
 {
 	int r;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-	struct ras_ih_if mmhub_ih_info;
 	struct ras_fs_if umc_fs_info = {
 		.sysfs_name = "umc_err_count",
 		.debugfs_name = "umc_err_inject",
@@ -770,10 +769,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)
 	struct ras_ih_if umc_ih_info = {
 		.cb = gmc_v9_0_process_ras_data_cb,
 	};
-	struct ras_fs_if mmhub_fs_info = {
-		.sysfs_name = "mmhub_err_count",
-		.debugfs_name = "mmhub_err_inject",
-	};
 
 	if (!adev->gmc.umc_ras_if) {
 		adev->gmc.umc_ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
@@ -797,25 +792,13 @@ static int gmc_v9_0_ecc_late_init(void *handle)
 			goto free;
 	}
 
-	if (!adev->gmc.mmhub_ras_if) {
-		adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
-		if (!adev->gmc.mmhub_ras_if)
-			return -ENOMEM;
-		adev->gmc.mmhub_ras_if->block = AMDGPU_RAS_BLOCK__MMHUB;
-		adev->gmc.mmhub_ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-		adev->gmc.mmhub_ras_if->sub_block_index = 0;
-		strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
-	}
-	mmhub_ih_info.head = mmhub_fs_info.head = *adev->gmc.mmhub_ras_if;
-	r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
-				 &mmhub_fs_info, &mmhub_ih_info);
+	r = adev->mmhub_funcs->ras_late_init(adev);
 	if (r)
-		goto free;
+		return r;
 
 	return 0;
 free:
 	kfree(adev->gmc.umc_ras_if);
-	kfree(adev->gmc.mmhub_ras_if);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 04cd4b6..9f7d5d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -31,6 +31,7 @@
 #include "vega10_enum.h"
 
 #include "soc15_common.h"
+#include "amdgpu_ras.h"
 
 #define mmDAGB0_CNTL_MISC2_RV 0x008f
 #define mmDAGB0_CNTL_MISC2_RV_BASE_IDX 0
@@ -615,6 +616,33 @@ static void mmhub_v1_0_query_ras_error_count(struct amdgpu_device *adev,
 	}
 }
 
+static int mmhub_v1_0_ras_late_init(struct amdgpu_device *adev)
+{
+	int r;
+	struct ras_ih_if ih_info = { .cb = NULL };	/* no irq callback */
+	struct ras_fs_if fs_info = {
+		.sysfs_name = "mmhub_err_count",
+		.debugfs_name = "mmhub_err_inject",
+	};
+
+	if (!adev->gmc.mmhub_ras_if) {
+		adev->gmc.mmhub_ras_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+		if (!adev->gmc.mmhub_ras_if)
+			return -ENOMEM;
+		adev->gmc.mmhub_ras_if->block = AMDGPU_RAS_BLOCK__MMHUB;
+		adev->gmc.mmhub_ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+		strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
+	}
+	ih_info.head = fs_info.head = *adev->gmc.mmhub_ras_if;
+	r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if, &fs_info, &ih_info);
+	if (r) {
+		kfree(adev->gmc.mmhub_ras_if);
+		adev->gmc.mmhub_ras_if = NULL;	/* so a later call reallocates */
+	}
+	return r;
+}
+
 const struct amdgpu_mmhub_funcs mmhub_v1_0_funcs = {
+	.ras_late_init = mmhub_v1_0_ras_late_init,
 	.query_ras_error_count = mmhub_v1_0_query_ras_error_count,
 };
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 6/7] drm/amdgpu: add ras_late_init callback function for nbio v7_4
       [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2019-08-28 13:03   ` [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback function Hawking Zhang
@ 2019-08-28 13:03   ` Hawking Zhang
       [not found]     ` <1566997395-7185-6-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
  2019-08-28 13:03   ` [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback " Hawking Zhang
  2019-08-29  2:59   ` [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init Zhou1, Tao
  6 siblings, 1 reply; 18+ messages in thread
From: Hawking Zhang @ 2019-08-28 13:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Tao Zhou, Alex Deucher
  Cc: Hawking Zhang

ras_late_init callback function will be used to do common ras
init in late init phase.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 41 ++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index a04c5ea..51078da6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -81,12 +81,14 @@ struct amdgpu_nbio_funcs {
 	void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device *adev);
 	int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
 	int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev);
+	int (*ras_late_init)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_nbio {
 	const struct nbio_hdp_flush_reg *hdp_flush_reg;
 	struct amdgpu_irq_src ras_controller_irq;
 	struct amdgpu_irq_src ras_err_event_athub_irq;
+	struct ras_common_if *ras_if;
 	const struct amdgpu_nbio_funcs *funcs;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index faf9300..367f9d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -23,6 +23,7 @@
 #include "amdgpu.h"
 #include "amdgpu_atombios.h"
 #include "nbio_v7_4.h"
+#include "amdgpu_ras.h"
 
 #include "nbio/nbio_7_4_offset.h"
 #include "nbio/nbio_7_4_sh_mask.h"
@@ -468,6 +469,45 @@ static int nbio_v7_4_init_ras_err_event_athub_interrupt (struct amdgpu_device *a
 	return 0;
 }
 
+/* nbio ras late init; ras_if is zero-allocated, and cleared again on failure */
+static int nbio_v7_4_ras_late_init(struct amdgpu_device *adev)
+{
+	int r;
+	struct ras_ih_if ih_info = { .cb = NULL };
+	struct ras_fs_if fs_info = {
+		.sysfs_name = "nbio_err_count",
+		.debugfs_name = "nbio_err_inject",
+	};
+
+	if (!adev->nbio.ras_if) {
+		adev->nbio.ras_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+		if (!adev->nbio.ras_if)
+			return -ENOMEM;
+		adev->nbio.ras_if->block = AMDGPU_RAS_BLOCK__PCIE_BIF;
+		adev->nbio.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+		strcpy(adev->nbio.ras_if->name, "nbio");
+	}
+	ih_info.head = fs_info.head = *adev->nbio.ras_if;
+	r = amdgpu_ras_late_init(adev, adev->nbio.ras_if, &fs_info, &ih_info);
+	if (r)
+		goto free;
+
+	if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+		r = amdgpu_irq_get(adev, &adev->nbio.ras_controller_irq, 0);
+		if (r)
+			goto free;
+		r = amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+		if (r)
+			goto free;
+	}
+
+	return 0;
+free:
+	kfree(adev->nbio.ras_if);
+	adev->nbio.ras_if = NULL;
+	return r;
+}
+
 const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
 	.get_hdp_flush_req_offset = nbio_v7_4_get_hdp_flush_req_offset,
 	.get_hdp_flush_done_offset = nbio_v7_4_get_hdp_flush_done_offset,
@@ -493,4 +533,5 @@ const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
 	.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
 	.init_ras_controller_interrupt = nbio_v7_4_init_ras_controller_interrupt,
 	.init_ras_err_event_athub_interrupt = nbio_v7_4_init_ras_err_event_athub_interrupt,
+	.ras_late_init = nbio_v7_4_ras_late_init,
 };
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback for nbio v7_4
       [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
                     ` (4 preceding siblings ...)
  2019-08-28 13:03   ` [PATCH 6/7] drm/amdgpu: add ras_late_init callback function for nbio v7_4 Hawking Zhang
@ 2019-08-28 13:03   ` Hawking Zhang
       [not found]     ` <1566997395-7185-7-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
  2019-08-29  2:59   ` [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init Zhou1, Tao
  6 siblings, 1 reply; 18+ messages in thread
From: Hawking Zhang @ 2019-08-28 13:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Tao Zhou, Alex Deucher
  Cc: Hawking Zhang

invoke nbio ras_late_init callback function to do nbio ras init

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index e7f2539..f53bd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1206,11 +1206,15 @@ static int soc15_common_early_init(void *handle)
 static int soc15_common_late_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	int r = 0;	/* stays 0 when nbio has no ras_late_init callback */
 
 	if (amdgpu_sriov_vf(adev))
 		xgpu_ai_mailbox_get_irq(adev);
 
-	return 0;
+	if (adev->nbio.funcs->ras_late_init)
+		r = adev->nbio.funcs->ras_late_init(adev);
+
+	return r;
 }
 
 static int soc15_common_sw_init(void *handle)
@@ -1287,6 +1291,13 @@ static int soc15_common_hw_fini(void *handle)
 	if (amdgpu_sriov_vf(adev))
 		xgpu_ai_mailbox_put_irq(adev);
 
+	if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+		if (adev->nbio.funcs->init_ras_controller_interrupt)
+			amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
+		if (adev->nbio.funcs->init_ras_err_event_athub_interrupt)
+			amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+	}
+
 	return 0;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback for nbio v7_4
       [not found]     ` <1566997395-7185-7-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-28 17:29       ` Deucher, Alexander
  2019-08-29  1:25       ` Chen, Guchun
  1 sibling, 0 replies; 18+ messages in thread
From: Deucher, Alexander @ 2019-08-28 17:29 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhou1, Tao


[-- Attachment #1.1: Type: text/plain, Size: 2198 bytes --]

Nice clean up.  Series is:
Reviewed-by: Alex Deucher <alexander.deucher-5C7GfCeVMHo@public.gmane.org>
________________________________
From: Hawking Zhang <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
Sent: Wednesday, August 28, 2019 9:03 AM
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org <amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>; Zhou1, Tao <Tao.Zhou1-5C7GfCeVMHo@public.gmane.org>; Deucher, Alexander <Alexander.Deucher-5C7GfCeVMHo@public.gmane.org>
Cc: Zhang, Hawking <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
Subject: [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback for nbio v7_4

invoke nbio ras_late_init callback function to do nbio ras init

Signed-off-by: Hawking Zhang <Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index e7f2539..f53bd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1206,11 +1206,15 @@ static int soc15_common_early_init(void *handle)
 static int soc15_common_late_init(void *handle)
 {
         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int r;

         if (amdgpu_sriov_vf(adev))
                 xgpu_ai_mailbox_get_irq(adev);

-       return 0;
+       if (adev->nbio.funcs->ras_late_init)
+               r = adev->nbio.funcs->ras_late_init(adev);
+
+       return r;
 }

 static int soc15_common_sw_init(void *handle)
@@ -1287,6 +1291,13 @@ static int soc15_common_hw_fini(void *handle)
         if (amdgpu_sriov_vf(adev))
                 xgpu_ai_mailbox_put_irq(adev);

+       if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+               if (adev->nbio.funcs->init_ras_controller_interrupt)
+                       amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
+               if (adev->nbio.funcs->init_ras_err_event_athub_interrupt)
+                       amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+       }
+
         return 0;
 }

--
2.7.4


[-- Attachment #1.2: Type: text/html, Size: 4407 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* RE: [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback for nbio v7_4
       [not found]     ` <1566997395-7185-7-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
  2019-08-28 17:29       ` Deucher, Alexander
@ 2019-08-29  1:25       ` Chen, Guchun
       [not found]         ` <BYAPR12MB2806FE6895D90CD1AC1076D7F1A20-ZGDeBxoHBPk0CuAkIMgl3QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  1 sibling, 1 reply; 18+ messages in thread
From: Chen, Guchun @ 2019-08-29  1:25 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhou1, Tao, Deucher, Alexander
  Cc: Zhang, Hawking


Regards,
Guchun

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Hawking Zhang
Sent: Wednesday, August 28, 2019 9:03 PM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback for nbio v7_4

invoke nbio ras_late_init callback function to do nbio ras init

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index e7f2539..f53bd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1206,11 +1206,15 @@ static int soc15_common_early_init(void *handle)  static int soc15_common_late_init(void *handle)  {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	int r;
 
 	if (amdgpu_sriov_vf(adev))
 		xgpu_ai_mailbox_get_irq(adev);
 
-	return 0;
+	if (adev->nbio.funcs->ras_late_init)
+		r = adev->nbio.funcs->ras_late_init(adev);
+
+	return r;
[Guchun] We'd better initialize the return value "r" first. If adev->nbio.funcs->ras_late_init is NULL, we will return an uninitialized value.
 }
 
 static int soc15_common_sw_init(void *handle) @@ -1287,6 +1291,13 @@ static int soc15_common_hw_fini(void *handle)
 	if (amdgpu_sriov_vf(adev))
 		xgpu_ai_mailbox_put_irq(adev);
 
+	if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+		if (adev->nbio.funcs->init_ras_controller_interrupt)
+			amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
+		if (adev->nbio.funcs->init_ras_err_event_athub_interrupt)
+			amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+	}
+
 	return 0;
 }
 
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* RE: [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init
       [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
                     ` (5 preceding siblings ...)
  2019-08-28 13:03   ` [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback " Hawking Zhang
@ 2019-08-29  2:59   ` Zhou1, Tao
       [not found]     ` <MN2PR12MB305488AF7BE5CD29B8E6E152B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  6 siblings, 1 reply; 18+ messages in thread
From: Zhou1, Tao @ 2019-08-29  2:59 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deucher, Alexander
  Cc: Zhang, Hawking



> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang@amd.com>
> Sent: 2019年8月28日 21:03
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>;
> Deucher, Alexander <Alexander.Deucher@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: [PATCH 1/7] drm/amdgpu: add helper function to do common
> ras_late_init
> 
> In late_init for ras, the helper function will be used to 1). disable ras feature
> if the IP block is masked as disabled 2). send enable feature command if the
> ip block was masked as enabled 3). create debugfs/sysfs node per IP block 4).
> register interrupt handler
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57
> +++++++++++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 +++
>  2 files changed, 61 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 230f7e6..2c32f99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1564,6 +1564,63 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>  	return -EINVAL;
>  }
> 
> +/* helper function to handle common stuff in ip late init phase */ int
> +amdgpu_ras_late_init(struct amdgpu_device *adev,
> +			 struct ras_common_if *ras_block,
> +			 struct ras_fs_if *fs_info,
> +			 struct ras_ih_if *ih_info)
> +{
> +	int r;
> +
> +	/* disable RAS feature per IP block if it is not supported */
> +	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
> +		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
> +		return 0;
> +	}
> +
> +	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
> +	if (r) {
> +		if (r == -EAGAIN) {
> +			/* request gpu reset. will run again */
> +			amdgpu_ras_request_reset_on_boot(adev,
> +					ras_block->block);
> +			return 0;
> +		}
> +		/* in resume phase, if fail to enable ras,
> +		 * clean up all ras fs nodes, and disable ras */
> +		if (adev->in_suspend)
> +			goto cleanup;
> +	}
> +
> +	/* in resume phase, no need to create ras fs node */
> +	if (adev->in_suspend)
> +		return 0;
> +
> +	if (ras_block->block == AMDGPU_RAS_BLOCK__UMC ||
> +	    ras_block->block == AMDGPU_RAS_BLOCK__SDMA ||
> +	    ras_block->block == AMDGPU_RAS_BLOCK__GFX) {
[Tao] We can set ih_info to NULL if a ras block has no interrupt, and change the condition to "if (ih_info)".

> +		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
> +		if (r)
> +			goto interrupt;
> +	}
> +
> +	amdgpu_ras_debugfs_create(adev, fs_info);
> +
> +	r = amdgpu_ras_sysfs_create(adev, fs_info);
> +	if (r)
> +		goto sysfs;
> +
> +	return 0;
> +cleanup:
> +	amdgpu_ras_sysfs_remove(adev, ras_block);
> +sysfs:
> +	amdgpu_ras_debugfs_remove(adev, ras_block);
> +	amdgpu_ras_interrupt_remove_handler(adev, ih_info);
[Tao] This call lacks the if condition that guards amdgpu_ras_interrupt_add_handler above.

> +interrupt:
> +	amdgpu_ras_feature_enable(adev, ras_block, 0);
> +	return r;
> +}
> +
>  /* do some init work after IP late init as dependence.
>   * and it runs in resume/gpu reset/booting up cases.
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6c76bb2..5212961 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -566,6 +566,10 @@ amdgpu_ras_error_to_ta(enum
> amdgpu_ras_error_type error) {  int amdgpu_ras_init(struct amdgpu_device
> *adev);  int amdgpu_ras_fini(struct amdgpu_device *adev);  int
> amdgpu_ras_pre_fini(struct amdgpu_device *adev);
> +int amdgpu_ras_late_init(struct amdgpu_device *adev,
> +			 struct ras_common_if *ras_block,
> +			 struct ras_fs_if *fs_info,
> +			 struct ras_ih_if *ih_info);
> 
>  int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
>  		struct ras_common_if *head, bool enable);
> --
> 2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 2/7] drm/amdgpu: switch to amdgpu_ras_late_init for gfx v9 block
       [not found]     ` <1566997395-7185-2-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-29  3:26       ` Zhou1, Tao
  0 siblings, 0 replies; 18+ messages in thread
From: Zhou1, Tao @ 2019-08-29  3:26 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deucher, Alexander
  Cc: Zhang, Hawking



> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Hawking Zhang
> Sent: 2019年8月28日 21:03
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>;
> Deucher, Alexander <Alexander.Deucher@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: [PATCH 2/7] drm/amdgpu: switch to amdgpu_ras_late_init for gfx v9
> block
> 
> call helper function in late init phase to handle ras init for gfx ip block
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 90 ++++++++------------------------
> ---
>  1 file changed, 19 insertions(+), 71 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index a6bcbde..62ec451 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4389,7 +4389,6 @@ static int gfx_v9_0_process_ras_data_cb(struct
> amdgpu_device *adev,  static int gfx_v9_0_ecc_late_init(void *handle)  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -	struct ras_common_if **ras_if = &adev->gfx.ras_if;
>  	struct ras_ih_if ih_info = {
>  		.cb = gfx_v9_0_process_ras_data_cb,
>  	};
> @@ -4397,18 +4396,18 @@ static int gfx_v9_0_ecc_late_init(void *handle)
>  		.sysfs_name = "gfx_err_count",
>  		.debugfs_name = "gfx_err_inject",
>  	};
> -	struct ras_common_if ras_block = {
> -		.block = AMDGPU_RAS_BLOCK__GFX,
> -		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> -		.sub_block_index = 0,
> -		.name = "gfx",
> -	};
>  	int r;
> 
> -	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> -		amdgpu_ras_feature_enable_on_boot(adev, &ras_block, 0);
> -		return 0;
> +	if (!adev->gfx.ras_if) {
> +		adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if),
> GFP_KERNEL);
> +		if (!adev->gfx.ras_if)
> +			return -ENOMEM;
> +		adev->gfx.ras_if->block = AMDGPU_RAS_BLOCK__GFX;
> +		adev->gfx.ras_if->type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> +		adev->gfx.ras_if->sub_block_index = 0;
> +		strcpy(adev->gfx.ras_if->name, "gfx");
>  	}
> +	fs_info.head = ih_info.head = *adev->gfx.ras_if;
> 
>  	r = gfx_v9_0_do_edc_gds_workarounds(adev);
>  	if (r)
> @@ -4419,71 +4418,20 @@ static int gfx_v9_0_ecc_late_init(void *handle)
>  	if (r)
>  		return r;
> 
> -	/* handle resume path. */
> -	if (*ras_if) {
> -		/* resend ras TA enable cmd during resume.
> -		 * prepare to handle failure.
> -		 */
> -		ih_info.head = **ras_if;
> -		r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
> -		if (r) {
> -			if (r == -EAGAIN) {
> -				/* request a gpu reset. will run again. */
> -				amdgpu_ras_request_reset_on_boot(adev,
> -						AMDGPU_RAS_BLOCK__GFX);
> -				return 0;
> -			}
> -			/* fail to enable ras, cleanup all. */
> -			goto irq;
> -		}
> -		/* enable successfully. continue. */
> -		goto resume;
> -	}
> -
> -	*ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
> -	if (!*ras_if)
> -		return -ENOMEM;
> -
> -	**ras_if = ras_block;
> -
> -	r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
> -	if (r) {
> -		if (r == -EAGAIN) {
> -			amdgpu_ras_request_reset_on_boot(adev,
> -					AMDGPU_RAS_BLOCK__GFX);
> -			r = 0;
> -		}
> -		goto feature;
> -	}
> -
> -	ih_info.head = **ras_if;
> -	fs_info.head = **ras_if;
> -
> -	r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
> +	r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
> +				 &fs_info, &ih_info);
>  	if (r)
> -		goto interrupt;
> +		goto free;
> 
> -	amdgpu_ras_debugfs_create(adev, &fs_info);
> -
> -	r = amdgpu_ras_sysfs_create(adev, &fs_info);
> -	if (r)
> -		goto sysfs;
> -resume:
> -	r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
> -	if (r)
> -		goto irq;
[Tao] Is it necessary to add an amdgpu_ras_late_fini to free all the resources allocated by ras_late_init in the failure path?

> +	if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
> +		r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
> +		if (r)
> +			goto free;
> +	}
> 
>  	return 0;
> -irq:
> -	amdgpu_ras_sysfs_remove(adev, *ras_if);
> -sysfs:
> -	amdgpu_ras_debugfs_remove(adev, *ras_if);
> -	amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
> -interrupt:
> -	amdgpu_ras_feature_enable(adev, *ras_if, 0);
> -feature:
> -	kfree(*ras_if);
> -	*ras_if = NULL;
> +free:
> +	kfree(adev->gfx.ras_if);
>  	return r;
>  }
> 
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback function
       [not found]     ` <1566997395-7185-5-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-29  3:40       ` Zhou1, Tao
       [not found]         ` <MN2PR12MB3054E5F6FC389201F8C841A1B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Zhou1, Tao @ 2019-08-29  3:40 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deucher, Alexander
  Cc: Zhang, Hawking



> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang@amd.com>
> Sent: 2019年8月28日 21:03
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>;
> Deucher, Alexander <Alexander.Deucher@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback
> function
> 
> The function will be called in late init phase to do mmhub ras init
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c     | 21 ++-------------------
>  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c   | 28
> ++++++++++++++++++++++++++++
>  3 files changed, 31 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> index 2d75ecf..df04c71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> @@ -23,6 +23,7 @@
> 
>  struct amdgpu_mmhub_funcs {
>  	void (*ras_init)(struct amdgpu_device *adev);
> +	int (*ras_late_init)(struct amdgpu_device *adev);
>  	void (*query_ras_error_count)(struct amdgpu_device *adev,
>  					void *ras_error_status);
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 8dc13d2..26a6956 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -762,7 +762,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)  {
>  	int r;
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -	struct ras_ih_if mmhub_ih_info;
>  	struct ras_fs_if umc_fs_info = {
>  		.sysfs_name = "umc_err_count",
>  		.debugfs_name = "umc_err_inject",
> @@ -770,10 +769,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)
>  	struct ras_ih_if umc_ih_info = {
>  		.cb = gmc_v9_0_process_ras_data_cb,
>  	};
> -	struct ras_fs_if mmhub_fs_info = {
> -		.sysfs_name = "mmhub_err_count",
> -		.debugfs_name = "mmhub_err_inject",
> -	};
> 
>  	if (!adev->gmc.umc_ras_if) {
>  		adev->gmc.umc_ras_if = kmalloc(sizeof(struct
> ras_common_if), GFP_KERNEL); @@ -797,25 +792,13 @@ static int
> gmc_v9_0_ecc_late_init(void *handle)
>  			goto free;
>  	}
> 
> -	if (!adev->gmc.mmhub_ras_if) {
> -		adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct
> ras_common_if), GFP_KERNEL);
> -		if (!adev->gmc.mmhub_ras_if)
> -			return -ENOMEM;
> -		adev->gmc.mmhub_ras_if->block =
> AMDGPU_RAS_BLOCK__MMHUB;
> -		adev->gmc.mmhub_ras_if->type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> -		adev->gmc.mmhub_ras_if->sub_block_index = 0;
> -		strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
> -	}
> -	mmhub_ih_info.head = mmhub_fs_info.head = *adev-
> >gmc.mmhub_ras_if;
> -	r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
> -				 &mmhub_fs_info, &mmhub_ih_info);
> +	r = adev->mmhub_funcs->ras_late_init(adev);
[Tao] It's better to add "if (adev->mmhub_funcs->ras_late_init(adev))"

>  	if (r)
> -		goto free;
> +		return r;
> 
>  	return 0;
>  free:
>  	kfree(adev->gmc.umc_ras_if);
> -	kfree(adev->gmc.mmhub_ras_if);
>  	return r;
>  }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> index 04cd4b6..9f7d5d1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> @@ -31,6 +31,7 @@
>  #include "vega10_enum.h"
> 
>  #include "soc15_common.h"
> +#include "amdgpu_ras.h"
> 
>  #define mmDAGB0_CNTL_MISC2_RV 0x008f
>  #define mmDAGB0_CNTL_MISC2_RV_BASE_IDX 0 @@ -615,6 +616,33 @@
> static void mmhub_v1_0_query_ras_error_count(struct amdgpu_device
> *adev,
>  	}
>  }
> 
> +static int mmhub_v1_0_ras_late_init(struct amdgpu_device *adev) {
> +	int r;
> +	struct ras_ih_if mmhub_ih_info;
> +	struct ras_fs_if mmhub_fs_info = {
> +		.sysfs_name = "mmhub_err_count",
> +		.debugfs_name = "mmhub_err_inject",
> +	};
> +
> +	if (!adev->gmc.mmhub_ras_if) {
> +		adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct
> ras_common_if), GFP_KERNEL);
> +		if (!adev->gmc.mmhub_ras_if)
> +			return -ENOMEM;
> +		adev->gmc.mmhub_ras_if->block =
> AMDGPU_RAS_BLOCK__MMHUB;
> +		adev->gmc.mmhub_ras_if->type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> +		adev->gmc.mmhub_ras_if->sub_block_index = 0;
> +		strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
> +	}
> +	mmhub_ih_info.head = mmhub_fs_info.head = *adev-
> >gmc.mmhub_ras_if;
> +	r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
> +				 &mmhub_fs_info, &mmhub_ih_info);
> +	if (r)
> +		kfree(adev->gmc.mmhub_ras_if);
> +	return r;
> +}
> +
>  const struct amdgpu_mmhub_funcs mmhub_v1_0_funcs = {
> +	.ras_late_init = mmhub_v1_0_ras_late_init,
>  	.query_ras_error_count = mmhub_v1_0_query_ras_error_count,  };
> --
> 2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 6/7] drm/amdgpu: add ras_late_init callback function for nbio v7_4
       [not found]     ` <1566997395-7185-6-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-29  3:48       ` Zhou1, Tao
       [not found]         ` <MN2PR12MB30542D58869DF3B4883459C0B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Zhou1, Tao @ 2019-08-29  3:48 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deucher, Alexander
  Cc: Zhang, Hawking



> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang@amd.com>
> Sent: 2019年8月28日 21:03
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>;
> Deucher, Alexander <Alexander.Deucher@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: [PATCH 6/7] drm/amdgpu: add ras_late_init callback function for
> nbio v7_4
> 
> ras_late_init callback function will be used to do common ras init in late init
> phase.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 41
> ++++++++++++++++++++++++++++++++
>  2 files changed, 43 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> index a04c5ea..51078da6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> @@ -81,12 +81,14 @@ struct amdgpu_nbio_funcs {
>  	void (*handle_ras_err_event_athub_intr_no_bifring)(struct
> amdgpu_device *adev);
>  	int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
>  	int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device
> *adev);
> +	int (*ras_late_init)(struct amdgpu_device *adev);
>  };
> 
>  struct amdgpu_nbio {
>  	const struct nbio_hdp_flush_reg *hdp_flush_reg;
>  	struct amdgpu_irq_src ras_controller_irq;
>  	struct amdgpu_irq_src ras_err_event_athub_irq;
> +	struct ras_common_if *ras_if;
>  	const struct amdgpu_nbio_funcs *funcs;  };
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index faf9300..367f9d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -23,6 +23,7 @@
>  #include "amdgpu.h"
>  #include "amdgpu_atombios.h"
>  #include "nbio_v7_4.h"
> +#include "amdgpu_ras.h"
> 
>  #include "nbio/nbio_7_4_offset.h"
>  #include "nbio/nbio_7_4_sh_mask.h"
> @@ -468,6 +469,45 @@ static int
> nbio_v7_4_init_ras_err_event_athub_interrupt (struct amdgpu_device *a
>  	return 0;
>  }
> 
> +static int nbio_v7_4_ras_late_init(struct amdgpu_device *adev) {
> +	int r;
> +	struct ras_ih_if ih_info;
> +	struct ras_fs_if fs_info = {
> +		.sysfs_name = "nbio_err_count",
> +		.debugfs_name = "nbio_err_inject",
[Tao] The ras block name is AMDGPU_RAS_BLOCK__PCIE_BIF and its string name in ras_block_string is pcie_bif,
so the "nbio" naming used here may confuse QA in the future.

> +	};
> +
> +	if (!adev->nbio.ras_if) {
> +		adev->nbio.ras_if = kmalloc(sizeof(struct ras_common_if),
> GFP_KERNEL);
> +		if (!adev->nbio.ras_if)
> +			return -ENOMEM;
> +		adev->nbio.ras_if->block = AMDGPU_RAS_BLOCK__PCIE_BIF;
> +		adev->nbio.ras_if->type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> +		adev->nbio.ras_if->sub_block_index = 0;
> +		strcpy(adev->nbio.ras_if->name, "nbio");
> +	}
> +	ih_info.head = fs_info.head = *adev->nbio.ras_if;
> +	r = amdgpu_ras_late_init(adev, adev->nbio.ras_if,
> +				 &fs_info, &ih_info);
> +	if (r)
> +		goto free;
> +
> +	if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
> +		r = amdgpu_irq_get(adev, &adev->nbio.ras_controller_irq, 0);
> +		if (r)
> +			goto free;
> +		r = amdgpu_irq_get(adev, &adev-
> >nbio.ras_err_event_athub_irq, 0);
> +		if (r)
> +			goto free;
> +	}
> +
> +	return 0;
> +free:
> +	kfree(adev->nbio.ras_if);
> +	return r;
> +}
> +
>  const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
>  	.get_hdp_flush_req_offset = nbio_v7_4_get_hdp_flush_req_offset,
>  	.get_hdp_flush_done_offset =
> nbio_v7_4_get_hdp_flush_done_offset,
> @@ -493,4 +533,5 @@ const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
>  	.handle_ras_err_event_athub_intr_no_bifring =
> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
>  	.init_ras_controller_interrupt =
> nbio_v7_4_init_ras_controller_interrupt,
>  	.init_ras_err_event_athub_interrupt =
> nbio_v7_4_init_ras_err_event_athub_interrupt,
> +	.ras_late_init = nbio_v7_4_ras_late_init,
>  };
> --
> 2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init
       [not found]     ` <MN2PR12MB305488AF7BE5CD29B8E6E152B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-08-29  3:52       ` Zhou1, Tao
       [not found]         ` <MN2PR12MB305479DB08A348F173060E23B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Zhou1, Tao @ 2019-08-29  3:52 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deucher, Alexander
  Cc: Zhang, Hawking

Another way is to add a check for ih_info in amdgpu_ras_interrupt_add_handler and amdgpu_ras_interrupt_remove_handler directly.

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Zhou1, Tao
> Sent: 2019年8月29日 10:59
> To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-
> gfx@lists.freedesktop.org; Deucher, Alexander
> <Alexander.Deucher@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: RE: [PATCH 1/7] drm/amdgpu: add helper function to do common
> ras_late_init
> 
> 
> 
> > -----Original Message-----
> > From: Hawking Zhang <Hawking.Zhang@amd.com>
> > Sent: 2019年8月28日 21:03
> > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>;
> > Deucher, Alexander <Alexander.Deucher@amd.com>
> > Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> > Subject: [PATCH 1/7] drm/amdgpu: add helper function to do common
> > ras_late_init
> >
> > In late_init for ras, the helper function will be used to 1). disable
> > ras feature if the IP block is masked as disabled 2). send enable
> > feature command if the ip block was masked as enabled 3). create
> debugfs/sysfs node per IP block 4).
> > register interrupt handler
> >
> > Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57
> > +++++++++++++++++++++++++++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 +++
> >  2 files changed, 61 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 230f7e6..2c32f99 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -1564,6 +1564,63 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
> >  	return -EINVAL;
> >  }
> >
> > +/* helper function to handle common stuff in ip late init phase */
> > +int amdgpu_ras_late_init(struct amdgpu_device *adev,
> > +			 struct ras_common_if *ras_block,
> > +			 struct ras_fs_if *fs_info,
> > +			 struct ras_ih_if *ih_info)
> > +{
> > +	int r;
> > +
> > +	/* disable RAS feature per IP block if it is not supported */
> > +	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
> > +		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
> > +		return 0;
> > +	}
> > +
> > +	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
> > +	if (r) {
> > +		if (r == -EAGAIN) {
> > +			/* request gpu reset. will run again */
> > +			amdgpu_ras_request_reset_on_boot(adev,
> > +					ras_block->block);
> > +			return 0;
> > +		}
> > +		/* in resume phase, if fail to enable ras,
> > +		 * clean up all ras fs nodes, and disable ras */
> > +		if (adev->in_suspend)
> > +			goto cleanup;
> > +	}
> > +
> > +	/* in resume phase, no need to create ras fs node */
> > +	if (adev->in_suspend)
> > +		return 0;
> > +
> > +	if (ras_block->block == AMDGPU_RAS_BLOCK__UMC ||
> > +	    ras_block->block == AMDGPU_RAS_BLOCK__SDMA ||
> > +	    ras_block->block == AMDGPU_RAS_BLOCK__GFX) {
> [Tao] we can set ih_info to NULL if a ras block has no interrupt and change
> the condition to "if (ih_info)"
> 
> > +		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
> > +		if (r)
> > +			goto interrupt;
> > +	}
> > +
> > +	amdgpu_ras_debugfs_create(adev, fs_info);
> > +
> > +	r = amdgpu_ras_sysfs_create(adev, fs_info);
> > +	if (r)
> > +		goto sysfs;
> > +
> > +	return 0;
> > +cleanup:
> > +	amdgpu_ras_sysfs_remove(adev, ras_block);
> > +sysfs:
> > +	amdgpu_ras_debugfs_remove(adev, ras_block);
> > +	amdgpu_ras_interrupt_remove_handler(adev, ih_info);
> [Tao] lack of if condition
> 
> > +interrupt:
> > +	amdgpu_ras_feature_enable(adev, ras_block, 0);
> > +	return r;
> > +}
> > +
> >  /* do some init work after IP late init as dependence.
> >   * and it runs in resume/gpu reset/booting up cases.
> >   */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index 6c76bb2..5212961 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -566,6 +566,10 @@ amdgpu_ras_error_to_ta(enum
> amdgpu_ras_error_type
> > error) {  int amdgpu_ras_init(struct amdgpu_device *adev);  int
> > amdgpu_ras_fini(struct amdgpu_device *adev);  int
> > amdgpu_ras_pre_fini(struct amdgpu_device *adev);
> > +int amdgpu_ras_late_init(struct amdgpu_device *adev,
> > +			 struct ras_common_if *ras_block,
> > +			 struct ras_fs_if *fs_info,
> > +			 struct ras_ih_if *ih_info);
> >
> >  int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> >  		struct ras_common_if *head, bool enable);
> > --
> > 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback function
       [not found]         ` <MN2PR12MB3054E5F6FC389201F8C841A1B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-08-29  3:53           ` Zhou1, Tao
  0 siblings, 0 replies; 18+ messages in thread
From: Zhou1, Tao @ 2019-08-29  3:53 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deucher, Alexander
  Cc: Zhang, Hawking

Can we also add a ras_late_init for umc?

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Zhou1, Tao
> Sent: 2019年8月29日 11:41
> To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-
> gfx@lists.freedesktop.org; Deucher, Alexander
> <Alexander.Deucher@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: RE: [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback
> function
> 
> 
> 
> > -----Original Message-----
> > From: Hawking Zhang <Hawking.Zhang@amd.com>
> > Sent: 2019年8月28日 21:03
> > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>;
> > Deucher, Alexander <Alexander.Deucher@amd.com>
> > Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> > Subject: [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback
> > function
> >
> > The function will be called in late init phase to do mmhub ras init
> >
> > Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  1 +
> >  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c     | 21 ++-------------------
> >  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c   | 28
> > ++++++++++++++++++++++++++++
> >  3 files changed, 31 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> > index 2d75ecf..df04c71 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> > @@ -23,6 +23,7 @@
> >
> >  struct amdgpu_mmhub_funcs {
> >  	void (*ras_init)(struct amdgpu_device *adev);
> > +	int (*ras_late_init)(struct amdgpu_device *adev);
> >  	void (*query_ras_error_count)(struct amdgpu_device *adev,
> >  					void *ras_error_status);
> >  };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 8dc13d2..26a6956 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -762,7 +762,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)  {
> >  	int r;
> >  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > -	struct ras_ih_if mmhub_ih_info;
> >  	struct ras_fs_if umc_fs_info = {
> >  		.sysfs_name = "umc_err_count",
> >  		.debugfs_name = "umc_err_inject",
> > @@ -770,10 +769,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)
> >  	struct ras_ih_if umc_ih_info = {
> >  		.cb = gmc_v9_0_process_ras_data_cb,
> >  	};
> > -	struct ras_fs_if mmhub_fs_info = {
> > -		.sysfs_name = "mmhub_err_count",
> > -		.debugfs_name = "mmhub_err_inject",
> > -	};
> >
> >  	if (!adev->gmc.umc_ras_if) {
> >  		adev->gmc.umc_ras_if = kmalloc(sizeof(struct
> ras_common_if),
> > GFP_KERNEL); @@ -797,25 +792,13 @@ static int
> > gmc_v9_0_ecc_late_init(void *handle)
> >  			goto free;
> >  	}
> >
> > -	if (!adev->gmc.mmhub_ras_if) {
> > -		adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct
> > ras_common_if), GFP_KERNEL);
> > -		if (!adev->gmc.mmhub_ras_if)
> > -			return -ENOMEM;
> > -		adev->gmc.mmhub_ras_if->block =
> > AMDGPU_RAS_BLOCK__MMHUB;
> > -		adev->gmc.mmhub_ras_if->type =
> > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> > -		adev->gmc.mmhub_ras_if->sub_block_index = 0;
> > -		strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
> > -	}
> > -	mmhub_ih_info.head = mmhub_fs_info.head = *adev-
> > >gmc.mmhub_ras_if;
> > -	r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
> > -				 &mmhub_fs_info, &mmhub_ih_info);
> > +	r = adev->mmhub_funcs->ras_late_init(adev);
> [Tao] It's better to add "if (adev->mmhub_funcs->ras_late_init(adev))"
> 
> >  	if (r)
> > -		goto free;
> > +		return r;
> >
> >  	return 0;
> >  free:
> >  	kfree(adev->gmc.umc_ras_if);
> > -	kfree(adev->gmc.mmhub_ras_if);
> >  	return r;
> >  }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> > b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> > index 04cd4b6..9f7d5d1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> > @@ -31,6 +31,7 @@
> >  #include "vega10_enum.h"
> >
> >  #include "soc15_common.h"
> > +#include "amdgpu_ras.h"
> >
> >  #define mmDAGB0_CNTL_MISC2_RV 0x008f
> >  #define mmDAGB0_CNTL_MISC2_RV_BASE_IDX 0 @@ -615,6 +616,33 @@
> static
> > void mmhub_v1_0_query_ras_error_count(struct amdgpu_device *adev,
> >  	}
> >  }
> >
> > +static int mmhub_v1_0_ras_late_init(struct amdgpu_device *adev) {
> > +	int r;
> > +	struct ras_ih_if mmhub_ih_info;
> > +	struct ras_fs_if mmhub_fs_info = {
> > +		.sysfs_name = "mmhub_err_count",
> > +		.debugfs_name = "mmhub_err_inject",
> > +	};
> > +
> > +	if (!adev->gmc.mmhub_ras_if) {
> > +		adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct
> > ras_common_if), GFP_KERNEL);
> > +		if (!adev->gmc.mmhub_ras_if)
> > +			return -ENOMEM;
> > +		adev->gmc.mmhub_ras_if->block =
> > AMDGPU_RAS_BLOCK__MMHUB;
> > +		adev->gmc.mmhub_ras_if->type =
> > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> > +		adev->gmc.mmhub_ras_if->sub_block_index = 0;
> > +		strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
> > +	}
> > +	mmhub_ih_info.head = mmhub_fs_info.head = *adev-
> > >gmc.mmhub_ras_if;
> > +	r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
> > +				 &mmhub_fs_info, &mmhub_ih_info);
> > +	if (r)
> > +		kfree(adev->gmc.mmhub_ras_if);
> > +	return r;
> > +}
> > +
> >  const struct amdgpu_mmhub_funcs mmhub_v1_0_funcs = {
> > +	.ras_late_init = mmhub_v1_0_ras_late_init,
> >  	.query_ras_error_count = mmhub_v1_0_query_ras_error_count,  };
> > --
> > 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init
       [not found]         ` <MN2PR12MB305479DB08A348F173060E23B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-08-29  5:48           ` Zhang, Hawking
  0 siblings, 0 replies; 18+ messages in thread
From: Zhang, Hawking @ 2019-08-29  5:48 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deucher, Alexander

Good point. I think we can check ih_info.cb, instead of ras_block, as the check condition. On the other hand, I initialized the header in ih_info in case someone uses it somewhere...

Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: 2019年8月29日 11:52
To: Zhou1, Tao <Tao.Zhou1@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init

Another way is to add a check for ih_info in amdgpu_ras_interrupt_add_handler and amdgpu_ras_interrupt_remove_handler directly.

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Zhou1, Tao
> Sent: 2019年8月29日 10:59
> To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd- 
> gfx@lists.freedesktop.org; Deucher, Alexander 
> <Alexander.Deucher@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: RE: [PATCH 1/7] drm/amdgpu: add helper function to do common 
> ras_late_init
> 
> 
> 
> > -----Original Message-----
> > From: Hawking Zhang <Hawking.Zhang@amd.com>
> > Sent: 2019年8月28日 21:03
> > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; 
> > Deucher, Alexander <Alexander.Deucher@amd.com>
> > Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> > Subject: [PATCH 1/7] drm/amdgpu: add helper function to do common 
> > ras_late_init
> >
> > In late_init for ras, the helper function will be used to 1). 
> > disable ras feature if the IP block is masked as disabled 2). send 
> > enable feature command if the ip block was masked as enabled 3). 
> > create
> debugfs/sysfs node per IP block 4).
> > register interrupt handler
> >
> > Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57
> > +++++++++++++++++++++++++++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 +++
> >  2 files changed, 61 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 230f7e6..2c32f99 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -1564,6 +1564,63 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
> >  	return -EINVAL;
> >  }
> >
> > +/* helper function to handle common stuff in ip late init phase */ 
> > +int amdgpu_ras_late_init(struct amdgpu_device *adev,
> > +			 struct ras_common_if *ras_block,
> > +			 struct ras_fs_if *fs_info,
> > +			 struct ras_ih_if *ih_info)
> > +{
> > +	int r;
> > +
> > +	/* disable RAS feature per IP block if it is not supported */
> > +	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
> > +		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
> > +		return 0;
> > +	}
> > +
> > +	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
> > +	if (r) {
> > +		if (r == -EAGAIN) {
> > +			/* request gpu reset. will run again */
> > +			amdgpu_ras_request_reset_on_boot(adev,
> > +					ras_block->block);
> > +			return 0;
> > +		}
> > +		/* in resume phase, if fail to enable ras,
> > +		 * clean up all ras fs nodes, and disable ras */
> > +		if (adev->in_suspend)
> > +			goto cleanup;
> > +	}
> > +
> > +	/* in resume phase, no need to create ras fs node */
> > +	if (adev->in_suspend)
> > +		return 0;
> > +
> > +	if (ras_block->block == AMDGPU_RAS_BLOCK__UMC ||
> > +	    ras_block->block == AMDGPU_RAS_BLOCK__SDMA ||
> > +	    ras_block->block == AMDGPU_RAS_BLOCK__GFX) {
> [Tao] we can set ih_info to NULL if a ras block has no interrupt and 
> change the condition to "if (ih_info)"
> 
> > +		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
> > +		if (r)
> > +			goto interrupt;
> > +	}
> > +
> > +	amdgpu_ras_debugfs_create(adev, fs_info);
> > +
> > +	r = amdgpu_ras_sysfs_create(adev, fs_info);
> > +	if (r)
> > +		goto sysfs;
> > +
> > +	return 0;
> > +cleanup:
> > +	amdgpu_ras_sysfs_remove(adev, ras_block);
> > +sysfs:
> > +	amdgpu_ras_debugfs_remove(adev, ras_block);
> > +	amdgpu_ras_interrupt_remove_handler(adev, ih_info);
> [Tao] lack of if condition
> 
> > +interrupt:
> > +	amdgpu_ras_feature_enable(adev, ras_block, 0);
> > +	return r;
> > +}
> > +
> >  /* do some init work after IP late init as dependence.
> >   * and it runs in resume/gpu reset/booting up cases.
> >   */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index 6c76bb2..5212961 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -566,6 +566,10 @@ amdgpu_ras_error_to_ta(enum
> amdgpu_ras_error_type
> > error) {  int amdgpu_ras_init(struct amdgpu_device *adev);  int 
> > amdgpu_ras_fini(struct amdgpu_device *adev);  int 
> > amdgpu_ras_pre_fini(struct amdgpu_device *adev);
> > +int amdgpu_ras_late_init(struct amdgpu_device *adev,
> > +			 struct ras_common_if *ras_block,
> > +			 struct ras_fs_if *fs_info,
> > +			 struct ras_ih_if *ih_info);
> >
> >  int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> >  		struct ras_common_if *head, bool enable);
> > --
> > 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback for nbio v7_4
       [not found]         ` <BYAPR12MB2806FE6895D90CD1AC1076D7F1A20-ZGDeBxoHBPk0CuAkIMgl3QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-08-29  5:49           ` Zhang, Hawking
  0 siblings, 0 replies; 18+ messages in thread
From: Zhang, Hawking @ 2019-08-29  5:49 UTC (permalink / raw)
  To: Chen, Guchun, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Zhou1,
	Tao, Deucher, Alexander

Good catch. Will update it in v2.

Regards,
Hawking

-----Original Message-----
From: Chen, Guchun <Guchun.Chen@amd.com> 
Sent: 2019年8月29日 9:25
To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback for nbio v7_4


Regards,
Guchun

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Hawking Zhang
Sent: Wednesday, August 28, 2019 9:03 PM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback for nbio v7_4

invoke nbio ras_late_init callback function to do nbio ras init

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index e7f2539..f53bd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1206,11 +1206,15 @@ static int soc15_common_early_init(void *handle)  static int soc15_common_late_init(void *handle)  {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	int r;
 
 	if (amdgpu_sriov_vf(adev))
 		xgpu_ai_mailbox_get_irq(adev);
 
-	return 0;
+	if (adev->nbio.funcs->ras_late_init)
+		r = adev->nbio.funcs->ras_late_init(adev);
+
+	return r;
[Guchun] We'd better initialize the return value "r" first. If adev->nbio.funcs->ras_late_init is NULL, we will return an uninitialized value.
 }
 
 static int soc15_common_sw_init(void *handle) @@ -1287,6 +1291,13 @@ static int soc15_common_hw_fini(void *handle)
 	if (amdgpu_sriov_vf(adev))
 		xgpu_ai_mailbox_put_irq(adev);
 
+	if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+		if (adev->nbio.funcs->init_ras_controller_interrupt)
+			amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
+		if (adev->nbio.funcs->init_ras_err_event_athub_interrupt)
+			amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+	}
+
 	return 0;
 }
 
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* RE: [PATCH 6/7] drm/amdgpu: add ras_late_init callback function for nbio v7_4
       [not found]         ` <MN2PR12MB30542D58869DF3B4883459C0B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-08-29  5:52           ` Zhang, Hawking
  0 siblings, 0 replies; 18+ messages in thread
From: Zhang, Hawking @ 2019-08-29  5:52 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Deucher, Alexander

RE - [Tao] The ras block name is AMDGPU_RAS_BLOCK__PCIE_BIF and its string name is pcie_bif in ras_block_string, so QA may be confused in the future.

I have no strong opinion on the naming, but it's good to align with the block string to avoid confusion. Will update in v2.

Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: 2019年8月29日 11:49
To: Zhang, Hawking <Hawking.Zhang@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH 6/7] drm/amdgpu: add ras_late_init callback function for nbio v7_4



> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang@amd.com>
> Sent: 2019年8月28日 21:03
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1@amd.com>; 
> Deucher, Alexander <Alexander.Deucher@amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: [PATCH 6/7] drm/amdgpu: add ras_late_init callback function 
> for nbio v7_4
> 
> ras_late_init callback function will be used to do common ras init in 
> late init phase.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 41
> ++++++++++++++++++++++++++++++++
>  2 files changed, 43 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> index a04c5ea..51078da6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> @@ -81,12 +81,14 @@ struct amdgpu_nbio_funcs {
>  	void (*handle_ras_err_event_athub_intr_no_bifring)(struct
> amdgpu_device *adev);
>  	int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
>  	int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device 
> *adev);
> +	int (*ras_late_init)(struct amdgpu_device *adev);
>  };
> 
>  struct amdgpu_nbio {
>  	const struct nbio_hdp_flush_reg *hdp_flush_reg;
>  	struct amdgpu_irq_src ras_controller_irq;
>  	struct amdgpu_irq_src ras_err_event_athub_irq;
> +	struct ras_common_if *ras_if;
>  	const struct amdgpu_nbio_funcs *funcs;  };
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index faf9300..367f9d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -23,6 +23,7 @@
>  #include "amdgpu.h"
>  #include "amdgpu_atombios.h"
>  #include "nbio_v7_4.h"
> +#include "amdgpu_ras.h"
> 
>  #include "nbio/nbio_7_4_offset.h"
>  #include "nbio/nbio_7_4_sh_mask.h"
> @@ -468,6 +469,45 @@ static int
> nbio_v7_4_init_ras_err_event_athub_interrupt (struct amdgpu_device *a
>  	return 0;
>  }
> 
> +static int nbio_v7_4_ras_late_init(struct amdgpu_device *adev) {
> +	int r;
> +	struct ras_ih_if ih_info;
> +	struct ras_fs_if fs_info = {
> +		.sysfs_name = "nbio_err_count",
> +		.debugfs_name = "nbio_err_inject",
[Tao] The ras block name is AMDGPU_RAS_BLOCK__PCIE_BIF and its string name is pcie_bif in ras_block_string, so QA may be confused in the future.

> +	};
> +
> +	if (!adev->nbio.ras_if) {
> +		adev->nbio.ras_if = kmalloc(sizeof(struct ras_common_if),
> GFP_KERNEL);
> +		if (!adev->nbio.ras_if)
> +			return -ENOMEM;
> +		adev->nbio.ras_if->block = AMDGPU_RAS_BLOCK__PCIE_BIF;
> +		adev->nbio.ras_if->type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> +		adev->nbio.ras_if->sub_block_index = 0;
> +		strcpy(adev->nbio.ras_if->name, "nbio");
> +	}
> +	ih_info.head = fs_info.head = *adev->nbio.ras_if;
> +	r = amdgpu_ras_late_init(adev, adev->nbio.ras_if,
> +				 &fs_info, &ih_info);
> +	if (r)
> +		goto free;
> +
> +	if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
> +		r = amdgpu_irq_get(adev, &adev->nbio.ras_controller_irq, 0);
> +		if (r)
> +			goto free;
> +		r = amdgpu_irq_get(adev, &adev-
> >nbio.ras_err_event_athub_irq, 0);
> +		if (r)
> +			goto free;
> +	}
> +
> +	return 0;
> +free:
> +	kfree(adev->nbio.ras_if);
> +	return r;
> +}
> +
>  const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
>  	.get_hdp_flush_req_offset = nbio_v7_4_get_hdp_flush_req_offset,
>  	.get_hdp_flush_done_offset =
> nbio_v7_4_get_hdp_flush_done_offset,
> @@ -493,4 +533,5 @@ const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
>  	.handle_ras_err_event_athub_intr_no_bifring = 
> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
>  	.init_ras_controller_interrupt =
> nbio_v7_4_init_ras_controller_interrupt,
>  	.init_ras_err_event_athub_interrupt = 
> nbio_v7_4_init_ras_err_event_athub_interrupt,
> +	.ras_late_init = nbio_v7_4_ras_late_init,
>  };
> --
> 2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2019-08-29  5:52 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-28 13:03 [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init Hawking Zhang
     [not found] ` <1566997395-7185-1-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
2019-08-28 13:03   ` [PATCH 2/7] drm/amdgpu: switch to amdgpu_ras_late_init for gfx v9 block Hawking Zhang
     [not found]     ` <1566997395-7185-2-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
2019-08-29  3:26       ` Zhou1, Tao
2019-08-28 13:03   ` [PATCH 3/7] drm/amdgpu: switch to ras_late_init for sdma v4 block Hawking Zhang
2019-08-28 13:03   ` [PATCH 4/7] drm/amdgpu: switch to ras_late_init for gmc v9 Hawking Zhang
2019-08-28 13:03   ` [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback function Hawking Zhang
     [not found]     ` <1566997395-7185-5-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
2019-08-29  3:40       ` Zhou1, Tao
     [not found]         ` <MN2PR12MB3054E5F6FC389201F8C841A1B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-08-29  3:53           ` Zhou1, Tao
2019-08-28 13:03   ` [PATCH 6/7] drm/amdgpu: add ras_late_init callback function for nbio v7_4 Hawking Zhang
     [not found]     ` <1566997395-7185-6-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
2019-08-29  3:48       ` Zhou1, Tao
     [not found]         ` <MN2PR12MB30542D58869DF3B4883459C0B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-08-29  5:52           ` Zhang, Hawking
2019-08-28 13:03   ` [PATCH 7/7] drm/amdgpu: switch to ras_late_init callback " Hawking Zhang
     [not found]     ` <1566997395-7185-7-git-send-email-Hawking.Zhang-5C7GfCeVMHo@public.gmane.org>
2019-08-28 17:29       ` Deucher, Alexander
2019-08-29  1:25       ` Chen, Guchun
     [not found]         ` <BYAPR12MB2806FE6895D90CD1AC1076D7F1A20-ZGDeBxoHBPk0CuAkIMgl3QdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-08-29  5:49           ` Zhang, Hawking
2019-08-29  2:59   ` [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init Zhou1, Tao
     [not found]     ` <MN2PR12MB305488AF7BE5CD29B8E6E152B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-08-29  3:52       ` Zhou1, Tao
     [not found]         ` <MN2PR12MB305479DB08A348F173060E23B0A20-rweVpJHSKTqnT25eLM+iUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-08-29  5:48           ` Zhang, Hawking

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.