All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt
@ 2022-12-07 10:03 Tao Zhou
  2022-12-07 10:03 ` [PATCH 2/6] drm/amdgpu: add RAS poison consumption handler for NV SRIOV Tao Zhou
                   ` (5 more replies)
  0 siblings, 6 replies; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
	david.yu
  Cc: Tao Zhou

PF will do page retirement, reset VF and inform VF to reserve RAS
bad pages.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 44 ++++++++++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c    |  6 ++++
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h    |  1 +
 4 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index f76c19fc0392..1c7fcb4f2380 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -169,25 +169,33 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
 	int ret = AMDGPU_RAS_SUCCESS;
 
-	if (!adev->gmc.xgmi.connected_to_cpu) {
-		struct ras_err_data err_data = {0, 0, 0, NULL};
-		struct ras_common_if head = {
-			.block = AMDGPU_RAS_BLOCK__UMC,
-		};
-		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
-
-		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
-
-		if (ret == AMDGPU_RAS_SUCCESS && obj) {
-			obj->err_data.ue_count += err_data.ue_count;
-			obj->err_data.ce_count += err_data.ce_count;
+	if (!amdgpu_sriov_vf(adev)) {
+		if (!adev->gmc.xgmi.connected_to_cpu) {
+			struct ras_err_data err_data = {0, 0, 0, NULL};
+			struct ras_common_if head = {
+				.block = AMDGPU_RAS_BLOCK__UMC,
+			};
+			struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+			ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+
+			if (ret == AMDGPU_RAS_SUCCESS && obj) {
+				obj->err_data.ue_count += err_data.ue_count;
+				obj->err_data.ce_count += err_data.ce_count;
+			}
+		} else if (reset) {
+			/* MCA poison handler is only responsible for GPU reset,
+			 * let MCA notifier do page retirement.
+			 */
+			kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+			amdgpu_ras_reset_gpu(adev);
 		}
-	} else if (reset) {
-		/* MCA poison handler is only responsible for GPU reset,
-		 * let MCA notifier do page retirement.
-		 */
-		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-		amdgpu_ras_reset_gpu(adev);
+	} else {
+		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
+			adev->virt.ops->ras_poison_handler(adev);
+		else
+			dev_warn(adev->dev,
+				"No ras_poison_handler interface in SRIOV!\n");
 	}
 
 	return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 2b9d806e23af..b9e9480448af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,6 +88,7 @@ struct amdgpu_virt_ops {
 	int (*wait_reset)(struct amdgpu_device *adev);
 	void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
 			  u32 data1, u32 data2, u32 data3);
+	void (*ras_poison_handler)(struct amdgpu_device *adev);
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 12906ba74462..63725b2ebc03 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,6 +404,11 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
 	return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);
 }
 
+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev)
+{
+	xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
+}
+
 const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
 	.req_full_gpu	= xgpu_ai_request_full_gpu_access,
 	.rel_full_gpu	= xgpu_ai_release_full_gpu_access,
@@ -411,4 +416,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
 	.wait_reset = NULL,
 	.trans_msg = xgpu_ai_mailbox_trans_msg,
 	.req_init_data  = xgpu_ai_request_init_data,
+	.ras_poison_handler = xgpu_ai_ras_poison_handler,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index fa7e13e0459e..0136bd059f68 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -39,6 +39,7 @@ enum idh_request {
 
 	IDH_LOG_VF_ERROR       = 200,
 	IDH_READY_TO_RESET 	= 201,
+	IDH_RAS_POISON		= 202,
 };
 
 enum idh_event {
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/6] drm/amdgpu: add RAS poison consumption handler for NV SRIOV
  2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
  2022-12-07 10:03 ` [PATCH 3/6] drm/amdgpu: add VCN poison consumption handler for SRIOV Tao Zhou
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
	david.yu
  Cc: Tao Zhou

Send handling request to host.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 ++++++
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 1 +
 2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index e07757eea7ad..cae1aaa4ddb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -426,6 +426,11 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
 	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
 }
 
+static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev)
+{
+	xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+}
+
 const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
 	.req_full_gpu	= xgpu_nv_request_full_gpu_access,
 	.rel_full_gpu	= xgpu_nv_release_full_gpu_access,
@@ -433,4 +438,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
 	.reset_gpu = xgpu_nv_request_reset,
 	.wait_reset = NULL,
 	.trans_msg = xgpu_nv_mailbox_trans_msg,
+	.ras_poison_handler = xgpu_nv_ras_poison_handler,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
index 73887b0aa1d6..0331d9c1a09b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -39,6 +39,7 @@ enum idh_request {
 
 	IDH_LOG_VF_ERROR	= 200,
 	IDH_READY_TO_RESET 	= 201,
+	IDH_RAS_POISON		= 202,
 };
 
 enum idh_event {
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/6] drm/amdgpu: add VCN poison consumption handler for SRIOV
  2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
  2022-12-07 10:03 ` [PATCH 2/6] drm/amdgpu: add RAS poison consumption handler for NV SRIOV Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
  2022-12-07 10:03 ` [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV Tao Zhou
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
	david.yu
  Cc: Tao Zhou

Inform host and let host handle consumption interrupt.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 72fa14ff862f..a23e26b272b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1246,8 +1246,16 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
 	if (!ras_if)
 		return 0;
 
-	ih_data.head = *ras_if;
-	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
+	if (!amdgpu_sriov_vf(adev)) {
+		ih_data.head = *ras_if;
+		amdgpu_ras_interrupt_dispatch(adev, &ih_data);
+	} else {
+		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
+			adev->virt.ops->ras_poison_handler(adev);
+		else
+			dev_warn(adev->dev,
+				"No ras_poison_handler interface in SRIOV for VCN!\n");
+	}
 
 	return 0;
 }
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV
  2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
  2022-12-07 10:03 ` [PATCH 2/6] drm/amdgpu: add RAS poison consumption handler for NV SRIOV Tao Zhou
  2022-12-07 10:03 ` [PATCH 3/6] drm/amdgpu: add VCN poison consumption handler for SRIOV Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
  2022-12-07 15:04   ` Zhang, Hawking
  2022-12-07 10:03 ` [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting Tao Zhou
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
	david.yu
  Cc: Tao Zhou

Skip RAS error injection when running as an SRIOV VF and return a successful status instead.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ad490c1e2f57..854cff9e7ebd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1103,15 +1103,24 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 							  block_info.address);
 	}
 
-	if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
-		if (block_obj->hw_ops->ras_error_inject)
-			ret = block_obj->hw_ops->ras_error_inject(adev, info);
+	if (amdgpu_sriov_vf(adev)) {
+		dev_info(adev->dev, "RAS injection is skipped in SRIOV\n");
+		ret = 0;
 	} else {
-		/* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
-		if (block_obj->hw_ops->ras_error_inject)
-			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
-		else  /*If not defined .ras_error_inject, use default ras_error_inject*/
-			ret = psp_ras_trigger_error(&adev->psp, &block_info);
+		if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
+			if (block_obj->hw_ops->ras_error_inject)
+				ret = block_obj->hw_ops->ras_error_inject(adev, info);
+		} else {
+			/* If defined special ras_error_inject(e.g: xgmi),
+			 * implement special ras_error_inject
+			 */
+			if (block_obj->hw_ops->ras_error_inject)
+				ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+			else   /* If not defined .ras_error_inject, use default
+				* ras_error_inject
+				*/
+				ret = psp_ras_trigger_error(&adev->psp, &block_info);
+		}
 	}
 
 	if (ret)
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting
  2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
                   ` (2 preceding siblings ...)
  2022-12-07 10:03 ` [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
  2022-12-07 15:19   ` Zhang, Hawking
  2022-12-07 10:03 ` [PATCH 6/6] drm/amdgpu: define RAS poison mode query function Tao Zhou
  2022-12-07 15:25 ` [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Zhang, Hawking
  5 siblings, 1 reply; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
	david.yu
  Cc: Tao Zhou

The enablement of VCN/JPEG RAS is unrelated to SRIOV.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 854cff9e7ebd..20474708bc7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2353,22 +2353,22 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 
 		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
 			dev_info(adev->dev, "SRAM ECC is active.\n");
-			if (!amdgpu_sriov_vf(adev)) {
+			if (!amdgpu_sriov_vf(adev))
 				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
 							    1 << AMDGPU_RAS_BLOCK__DF);
-
-				if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
-				    adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
-					adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
-							1 << AMDGPU_RAS_BLOCK__JPEG);
-				else
-					adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
-							1 << AMDGPU_RAS_BLOCK__JPEG);
-			} else {
+			else
 				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
 								1 << AMDGPU_RAS_BLOCK__SDMA |
 								1 << AMDGPU_RAS_BLOCK__GFX);
-			}
+
+			/* VCN/JPEG RAS setting is unrelated to SRIOV */
+			if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
+			    adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
+				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+							1 << AMDGPU_RAS_BLOCK__JPEG);
+			else
+				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+							1 << AMDGPU_RAS_BLOCK__JPEG);
 		} else {
 			dev_info(adev->dev, "SRAM ECC is not presented.\n");
 		}
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 6/6] drm/amdgpu: define RAS poison mode query function
  2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
                   ` (3 preceding siblings ...)
  2022-12-07 10:03 ` [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
  2022-12-07 15:26   ` Zhang, Hawking
  2022-12-07 15:25 ` [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Zhang, Hawking
  5 siblings, 1 reply; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
	david.yu
  Cc: Tao Zhou

1. no need to query poison mode on SRIOV guest side, host can handle it.
2. define the function to simplify code.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 54 +++++++++++++++----------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 20474708bc7d..2a5f23316f83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2414,11 +2414,42 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
 	pm_runtime_put_autosuspend(dev->dev);
 }
 
+static void amdgpu_ras_poison_mode_query(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	bool df_poison, umc_poison;
+
+	/* poison setting is useless on SRIOV guest */
+	if (amdgpu_sriov_vf(adev) || !con)
+		return;
+
+	/* Init poison supported flag, the default value is false */
+	if (adev->gmc.xgmi.connected_to_cpu) {
+		/* enabled by default when GPU is connected to CPU */
+		con->poison_supported = true;
+	} else if (adev->df.funcs &&
+	    adev->df.funcs->query_ras_poison_mode &&
+	    adev->umc.ras &&
+	    adev->umc.ras->query_ras_poison_mode) {
+		df_poison =
+			adev->df.funcs->query_ras_poison_mode(adev);
+		umc_poison =
+			adev->umc.ras->query_ras_poison_mode(adev);
+
+		/* Only poison is set in both DF and UMC, we can support it */
+		if (df_poison && umc_poison)
+			con->poison_supported = true;
+		else if (df_poison != umc_poison)
+			dev_warn(adev->dev,
+				"Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+				df_poison, umc_poison);
+	}
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	int r;
-	bool df_poison, umc_poison;
 
 	if (con)
 		return 0;
@@ -2493,26 +2524,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			goto release_con;
 	}
 
-	/* Init poison supported flag, the default value is false */
-	if (adev->gmc.xgmi.connected_to_cpu) {
-		/* enabled by default when GPU is connected to CPU */
-		con->poison_supported = true;
-	}
-	else if (adev->df.funcs &&
-	    adev->df.funcs->query_ras_poison_mode &&
-	    adev->umc.ras &&
-	    adev->umc.ras->query_ras_poison_mode) {
-		df_poison =
-			adev->df.funcs->query_ras_poison_mode(adev);
-		umc_poison =
-			adev->umc.ras->query_ras_poison_mode(adev);
-		/* Only poison is set in both DF and UMC, we can support it */
-		if (df_poison && umc_poison)
-			con->poison_supported = true;
-		else if (df_poison != umc_poison)
-			dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
-					df_poison, umc_poison);
-	}
+	amdgpu_ras_poison_mode_query(adev);
 
 	if (amdgpu_ras_fs_init(adev)) {
 		r = -EINVAL;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV
  2022-12-07 10:03 ` [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV Tao Zhou
@ 2022-12-07 15:04   ` Zhang, Hawking
  0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2022-12-07 15:04 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Wan, Gavin, Chander, Vignesh,
	Yu, David

[AMD Official Use Only - General]

It might be better to check amdgpu_sriov_vf() at the beginning of the function and return 0 directly if it is invoked from the guest side. There is no need to print anything; error injection from the guest is invalid.

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Wan, Gavin <Gavin.Wan@amd.com>; Chander, Vignesh <Vignesh.Chander@amd.com>; Yu, David <David.Yu@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV

Skip RAS error injection when running as an SRIOV VF and return a successful status instead.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ad490c1e2f57..854cff9e7ebd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1103,15 +1103,24 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
                                                          block_info.address);
        }

-       if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
-               if (block_obj->hw_ops->ras_error_inject)
-                       ret = block_obj->hw_ops->ras_error_inject(adev, info);
+       if (amdgpu_sriov_vf(adev)) {
+               dev_info(adev->dev, "RAS injection is skipped in SRIOV\n");
+               ret = 0;
        } else {
-               /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
-               if (block_obj->hw_ops->ras_error_inject)
-                       ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
-               else  /*If not defined .ras_error_inject, use default ras_error_inject*/
-                       ret = psp_ras_trigger_error(&adev->psp, &block_info);
+               if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
+                       if (block_obj->hw_ops->ras_error_inject)
+                               ret = block_obj->hw_ops->ras_error_inject(adev, info);
+               } else {
+                       /* If defined special ras_error_inject(e.g: xgmi),
+                        * implement special ras_error_inject
+                        */
+                       if (block_obj->hw_ops->ras_error_inject)
+                               ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+                       else   /* If not defined .ras_error_inject, use default
+                               * ras_error_inject
+                               */
+                               ret = psp_ras_trigger_error(&adev->psp, &block_info);
+               }
        }

        if (ret)
--
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting
  2022-12-07 10:03 ` [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting Tao Zhou
@ 2022-12-07 15:19   ` Zhang, Hawking
  0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2022-12-07 15:19 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Wan, Gavin, Chander, Vignesh,
	Yu, David

[AMD Official Use Only - General]

The commit description and the inline comments are confusing. I would say that we now support VCN RAS in both bare-metal and SRIOV environments.

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Wan, Gavin <Gavin.Wan@amd.com>; Chander, Vignesh <Vignesh.Chander@amd.com>; Yu, David <David.Yu@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting

The enablement of VCN/JPEG RAS is unrelated to SRIOV.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 854cff9e7ebd..20474708bc7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2353,22 +2353,22 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)

                if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
                        dev_info(adev->dev, "SRAM ECC is active.\n");
-                       if (!amdgpu_sriov_vf(adev)) {
+                       if (!amdgpu_sriov_vf(adev))
                                adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
                                                            1 << AMDGPU_RAS_BLOCK__DF);
-
-                               if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
-                                   adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
-                                       adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
-                                                       1 << AMDGPU_RAS_BLOCK__JPEG);
-                               else
-                                       adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
-                                                       1 << AMDGPU_RAS_BLOCK__JPEG);
-                       } else {
+                       else
                                adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
                                                                1 << AMDGPU_RAS_BLOCK__SDMA |
                                                                1 << AMDGPU_RAS_BLOCK__GFX);
-                       }
+
+                       /* VCN/JPEG RAS setting is unrelated to SRIOV */
+                       if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
+                           adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
+                               adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+                                                       1 << AMDGPU_RAS_BLOCK__JPEG);
+                       else
+                               adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+                                                       1 << AMDGPU_RAS_BLOCK__JPEG);
                } else {
                        dev_info(adev->dev, "SRAM ECC is not presented.\n");
                }
--
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt
  2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
                   ` (4 preceding siblings ...)
  2022-12-07 10:03 ` [PATCH 6/6] drm/amdgpu: define RAS poison mode query function Tao Zhou
@ 2022-12-07 15:25 ` Zhang, Hawking
  5 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2022-12-07 15:25 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Wan, Gavin, Chander, Vignesh,
	Yu, David

[AMD Official Use Only - General]

I suggest splitting the patch into two.

One adds the RAS poison handler for mxgpu ai products, similar to patch #2, and includes:

drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  1 +
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c    |  6 ++++
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h    |  1 +

The other adds the common UMC poison handling path for SRIOV, and includes:

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 44 ++++++++++++++----------
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Wan, Gavin <Gavin.Wan@amd.com>; Chander, Vignesh <Vignesh.Chander@amd.com>; Yu, David <David.Yu@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt

PF will do page retirement, reset VF and inform VF to reserve RAS bad pages.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 44 ++++++++++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c    |  6 ++++
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h    |  1 +
 4 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index f76c19fc0392..1c7fcb4f2380 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -169,25 +169,33 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)  {
        int ret = AMDGPU_RAS_SUCCESS;

-       if (!adev->gmc.xgmi.connected_to_cpu) {
-               struct ras_err_data err_data = {0, 0, 0, NULL};
-               struct ras_common_if head = {
-                       .block = AMDGPU_RAS_BLOCK__UMC,
-               };
-               struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
-
-               ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
-
-               if (ret == AMDGPU_RAS_SUCCESS && obj) {
-                       obj->err_data.ue_count += err_data.ue_count;
-                       obj->err_data.ce_count += err_data.ce_count;
+       if (!amdgpu_sriov_vf(adev)) {
+               if (!adev->gmc.xgmi.connected_to_cpu) {
+                       struct ras_err_data err_data = {0, 0, 0, NULL};
+                       struct ras_common_if head = {
+                               .block = AMDGPU_RAS_BLOCK__UMC,
+                       };
+                       struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+                       ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+
+                       if (ret == AMDGPU_RAS_SUCCESS && obj) {
+                               obj->err_data.ue_count += err_data.ue_count;
+                               obj->err_data.ce_count += err_data.ce_count;
+                       }
+               } else if (reset) {
+                       /* MCA poison handler is only responsible for GPU reset,
+                        * let MCA notifier do page retirement.
+                        */
+                       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+                       amdgpu_ras_reset_gpu(adev);
                }
-       } else if (reset) {
-               /* MCA poison handler is only responsible for GPU reset,
-                * let MCA notifier do page retirement.
-                */
-               kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-               amdgpu_ras_reset_gpu(adev);
+       } else {
+               if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
+                       adev->virt.ops->ras_poison_handler(adev);
+               else
+                       dev_warn(adev->dev,
+                               "No ras_poison_handler interface in SRIOV!\n");
        }

        return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 2b9d806e23af..b9e9480448af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,6 +88,7 @@ struct amdgpu_virt_ops {
        int (*wait_reset)(struct amdgpu_device *adev);
        void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
                          u32 data1, u32 data2, u32 data3);
+       void (*ras_poison_handler)(struct amdgpu_device *adev);
 };

 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 12906ba74462..63725b2ebc03 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,6 +404,11 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
        return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);  }

+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev) {
+       xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); }
+
 const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
        .req_full_gpu   = xgpu_ai_request_full_gpu_access,
        .rel_full_gpu   = xgpu_ai_release_full_gpu_access,
@@ -411,4 +416,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
        .wait_reset = NULL,
        .trans_msg = xgpu_ai_mailbox_trans_msg,
        .req_init_data  = xgpu_ai_request_init_data,
+       .ras_poison_handler = xgpu_ai_ras_poison_handler,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index fa7e13e0459e..0136bd059f68 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -39,6 +39,7 @@ enum idh_request {

        IDH_LOG_VF_ERROR       = 200,
        IDH_READY_TO_RESET      = 201,
+       IDH_RAS_POISON          = 202,
 };

 enum idh_event {
--
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH 6/6] drm/amdgpu: define RAS poison mode query function
  2022-12-07 10:03 ` [PATCH 6/6] drm/amdgpu: define RAS poison mode query function Tao Zhou
@ 2022-12-07 15:26   ` Zhang, Hawking
  0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2022-12-07 15:26 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Yang, Stanley, Wan, Gavin, Chander, Vignesh,
	Yu, David

[AMD Official Use Only - General]

It might be better to rename this to amdgpu_ras_query_poison_mode to align with the naming style of the IP callbacks.

+static void amdgpu_ras_poison_mode_query

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Wan, Gavin <Gavin.Wan@amd.com>; Chander, Vignesh <Vignesh.Chander@amd.com>; Yu, David <David.Yu@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 6/6] drm/amdgpu: define RAS poison mode query function

1. no need to query poison mode on SRIOV guest side, host can handle it.
2. define the function to simplify code.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 54 +++++++++++++++----------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 20474708bc7d..2a5f23316f83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2414,11 +2414,42 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
        pm_runtime_put_autosuspend(dev->dev);
 }

+static void amdgpu_ras_poison_mode_query(struct amdgpu_device *adev) {
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       bool df_poison, umc_poison;
+
+       /* poison setting is useless on SRIOV guest */
+       if (amdgpu_sriov_vf(adev) || !con)
+               return;
+
+       /* Init poison supported flag, the default value is false */
+       if (adev->gmc.xgmi.connected_to_cpu) {
+               /* enabled by default when GPU is connected to CPU */
+               con->poison_supported = true;
+       } else if (adev->df.funcs &&
+           adev->df.funcs->query_ras_poison_mode &&
+           adev->umc.ras &&
+           adev->umc.ras->query_ras_poison_mode) {
+               df_poison =
+                       adev->df.funcs->query_ras_poison_mode(adev);
+               umc_poison =
+                       adev->umc.ras->query_ras_poison_mode(adev);
+
+               /* Only poison is set in both DF and UMC, we can support it */
+               if (df_poison && umc_poison)
+                       con->poison_supported = true;
+               else if (df_poison != umc_poison)
+                       dev_warn(adev->dev,
+                               "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+                               df_poison, umc_poison);
+       }
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)  {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int r;
-       bool df_poison, umc_poison;

        if (con)
                return 0;
@@ -2493,26 +2524,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                        goto release_con;
        }

-       /* Init poison supported flag, the default value is false */
-       if (adev->gmc.xgmi.connected_to_cpu) {
-               /* enabled by default when GPU is connected to CPU */
-               con->poison_supported = true;
-       }
-       else if (adev->df.funcs &&
-           adev->df.funcs->query_ras_poison_mode &&
-           adev->umc.ras &&
-           adev->umc.ras->query_ras_poison_mode) {
-               df_poison =
-                       adev->df.funcs->query_ras_poison_mode(adev);
-               umc_poison =
-                       adev->umc.ras->query_ras_poison_mode(adev);
-               /* Only poison is set in both DF and UMC, we can support it */
-               if (df_poison && umc_poison)
-                       con->poison_supported = true;
-               else if (df_poison != umc_poison)
-                       dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
-                                       df_poison, umc_poison);
-       }
+       amdgpu_ras_poison_mode_query(adev);

        if (amdgpu_ras_fs_init(adev)) {
                r = -EINVAL;
--
2.35.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-12-07 15:26 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
2022-12-07 10:03 ` [PATCH 2/6] drm/amdgpu: add RAS poison consumption handler for NV SRIOV Tao Zhou
2022-12-07 10:03 ` [PATCH 3/6] drm/amdgpu: add VCN poison consumption handler for SRIOV Tao Zhou
2022-12-07 10:03 ` [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV Tao Zhou
2022-12-07 15:04   ` Zhang, Hawking
2022-12-07 10:03 ` [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting Tao Zhou
2022-12-07 15:19   ` Zhang, Hawking
2022-12-07 10:03 ` [PATCH 6/6] drm/amdgpu: define RAS poison mode query function Tao Zhou
2022-12-07 15:26   ` Zhang, Hawking
2022-12-07 15:25 ` [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Zhang, Hawking

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.