* [PATCH 2/6] drm/amdgpu: add RAS poison consumption handler for NV SRIOV
2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
2022-12-07 10:03 ` [PATCH 3/6] drm/amdgpu: add VCN poison consumption handler for SRIOV Tao Zhou
` (4 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
david.yu
Cc: Tao Zhou
Send handling request to host.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 ++++++
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 1 +
2 files changed, 7 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index e07757eea7ad..cae1aaa4ddb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -426,6 +426,11 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
}
+static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev)
+{
+ xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+}
+
const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
.req_full_gpu = xgpu_nv_request_full_gpu_access,
.rel_full_gpu = xgpu_nv_release_full_gpu_access,
@@ -433,4 +438,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
.reset_gpu = xgpu_nv_request_reset,
.wait_reset = NULL,
.trans_msg = xgpu_nv_mailbox_trans_msg,
+ .ras_poison_handler = xgpu_nv_ras_poison_handler,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
index 73887b0aa1d6..0331d9c1a09b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -39,6 +39,7 @@ enum idh_request {
IDH_LOG_VF_ERROR = 200,
IDH_READY_TO_RESET = 201,
+ IDH_RAS_POISON = 202,
};
enum idh_event {
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 3/6] drm/amdgpu: add VCN poison consumption handler for SRIOV
2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
2022-12-07 10:03 ` [PATCH 2/6] drm/amdgpu: add RAS poison consumption handler for NV SRIOV Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
2022-12-07 10:03 ` [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV Tao Zhou
` (3 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
david.yu
Cc: Tao Zhou
Inform host and let host handle consumption interrupt.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 72fa14ff862f..a23e26b272b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1246,8 +1246,16 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
if (!ras_if)
return 0;
- ih_data.head = *ras_if;
- amdgpu_ras_interrupt_dispatch(adev, &ih_data);
+ if (!amdgpu_sriov_vf(adev)) {
+ ih_data.head = *ras_if;
+ amdgpu_ras_interrupt_dispatch(adev, &ih_data);
+ } else {
+ if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
+ adev->virt.ops->ras_poison_handler(adev);
+ else
+ dev_warn(adev->dev,
+ "No ras_poison_handler interface in SRIOV for VCN!\n");
+ }
return 0;
}
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV
2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
2022-12-07 10:03 ` [PATCH 2/6] drm/amdgpu: add RAS poison consumption handler for NV SRIOV Tao Zhou
2022-12-07 10:03 ` [PATCH 3/6] drm/amdgpu: add VCN poison consumption handler for SRIOV Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
2022-12-07 15:04 ` Zhang, Hawking
2022-12-07 10:03 ` [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting Tao Zhou
` (2 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
david.yu
Cc: Tao Zhou
And return successful status.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ad490c1e2f57..854cff9e7ebd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1103,15 +1103,24 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
block_info.address);
}
- if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
- if (block_obj->hw_ops->ras_error_inject)
- ret = block_obj->hw_ops->ras_error_inject(adev, info);
+ if (amdgpu_sriov_vf(adev)) {
+ dev_info(adev->dev, "RAS injection is skipped in SRIOV\n");
+ ret = 0;
} else {
- /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
- if (block_obj->hw_ops->ras_error_inject)
- ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
- else /*If not defined .ras_error_inject, use default ras_error_inject*/
- ret = psp_ras_trigger_error(&adev->psp, &block_info);
+ if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
+ if (block_obj->hw_ops->ras_error_inject)
+ ret = block_obj->hw_ops->ras_error_inject(adev, info);
+ } else {
+ /* If defined special ras_error_inject(e.g: xgmi),
+ * implement special ras_error_inject
+ */
+ if (block_obj->hw_ops->ras_error_inject)
+ ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+ else /* If not defined .ras_error_inject, use default
+ * ras_error_inject
+ */
+ ret = psp_ras_trigger_error(&adev->psp, &block_info);
+ }
}
if (ret)
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* RE: [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV
2022-12-07 10:03 ` [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV Tao Zhou
@ 2022-12-07 15:04 ` Zhang, Hawking
0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2022-12-07 15:04 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx, Yang, Stanley, Wan, Gavin, Chander, Vignesh,
Yu, David
[AMD Official Use Only - General]
It might be better check amdgpu_sriov_vf from the beginning of the function. Return 0 directly if it is invoked from guest side. Don't need to print out something, error injection from guest is invalid.
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Wan, Gavin <Gavin.Wan@amd.com>; Chander, Vignesh <Vignesh.Chander@amd.com>; Yu, David <David.Yu@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV
And return successful status.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ad490c1e2f57..854cff9e7ebd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1103,15 +1103,24 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
block_info.address);
}
- if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
- if (block_obj->hw_ops->ras_error_inject)
- ret = block_obj->hw_ops->ras_error_inject(adev, info);
+ if (amdgpu_sriov_vf(adev)) {
+ dev_info(adev->dev, "RAS injection is skipped in SRIOV\n");
+ ret = 0;
} else {
- /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
- if (block_obj->hw_ops->ras_error_inject)
- ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
- else /*If not defined .ras_error_inject, use default ras_error_inject*/
- ret = psp_ras_trigger_error(&adev->psp, &block_info);
+ if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
+ if (block_obj->hw_ops->ras_error_inject)
+ ret = block_obj->hw_ops->ras_error_inject(adev, info);
+ } else {
+ /* If defined special ras_error_inject(e.g: xgmi),
+ * implement special ras_error_inject
+ */
+ if (block_obj->hw_ops->ras_error_inject)
+ ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+ else /* If not defined .ras_error_inject, use default
+ * ras_error_inject
+ */
+ ret = psp_ras_trigger_error(&adev->psp, &block_info);
+ }
}
if (ret)
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting
2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
` (2 preceding siblings ...)
2022-12-07 10:03 ` [PATCH 4/6] drm/amdgpu: skip RAS error injection in SRIOV Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
2022-12-07 15:19 ` Zhang, Hawking
2022-12-07 10:03 ` [PATCH 6/6] drm/amdgpu: define RAS poison mode query function Tao Zhou
2022-12-07 15:25 ` [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Zhang, Hawking
5 siblings, 1 reply; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
david.yu
Cc: Tao Zhou
The enablement of VCN/JPEG RAS is unrelated to SRIOV.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 854cff9e7ebd..20474708bc7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2353,22 +2353,22 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
dev_info(adev->dev, "SRAM ECC is active.\n");
- if (!amdgpu_sriov_vf(adev)) {
+ if (!amdgpu_sriov_vf(adev))
adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
1 << AMDGPU_RAS_BLOCK__DF);
-
- if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
- adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
- adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
- 1 << AMDGPU_RAS_BLOCK__JPEG);
- else
- adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
- 1 << AMDGPU_RAS_BLOCK__JPEG);
- } else {
+ else
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
1 << AMDGPU_RAS_BLOCK__SDMA |
1 << AMDGPU_RAS_BLOCK__GFX);
- }
+
+ /* VCN/JPEG RAS setting is unrelated to SRIOV */
+ if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
+ adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
+ adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+ else
+ adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
} else {
dev_info(adev->dev, "SRAM ECC is not presented.\n");
}
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* RE: [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting
2022-12-07 10:03 ` [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting Tao Zhou
@ 2022-12-07 15:19 ` Zhang, Hawking
0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2022-12-07 15:19 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx, Yang, Stanley, Wan, Gavin, Chander, Vignesh,
Yu, David
[AMD Official Use Only - General]
the commit description and the inline comments are confusing. I would say we support VCN RAS in both bare-metal and SRIOV environment now.
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Wan, Gavin <Gavin.Wan@amd.com>; Chander, Vignesh <Vignesh.Chander@amd.com>; Yu, David <David.Yu@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting
The enablement of VCN/JPEG RAS is unrelated to SRIOV.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 854cff9e7ebd..20474708bc7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2353,22 +2353,22 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
dev_info(adev->dev, "SRAM ECC is active.\n");
- if (!amdgpu_sriov_vf(adev)) {
+ if (!amdgpu_sriov_vf(adev))
adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
1 << AMDGPU_RAS_BLOCK__DF);
-
- if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
- adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
- adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
- 1 << AMDGPU_RAS_BLOCK__JPEG);
- else
- adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
- 1 << AMDGPU_RAS_BLOCK__JPEG);
- } else {
+ else
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
1 << AMDGPU_RAS_BLOCK__SDMA |
1 << AMDGPU_RAS_BLOCK__GFX);
- }
+
+ /* VCN/JPEG RAS setting is unrelated to SRIOV */
+ if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
+ adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
+ adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+ else
+ adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
} else {
dev_info(adev->dev, "SRAM ECC is not presented.\n");
}
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 6/6] drm/amdgpu: define RAS poison mode query function
2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
` (3 preceding siblings ...)
2022-12-07 10:03 ` [PATCH 5/6] drm/amdgpu: update VCN/JPEG RAS setting Tao Zhou
@ 2022-12-07 10:03 ` Tao Zhou
2022-12-07 15:26 ` Zhang, Hawking
2022-12-07 15:25 ` [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Zhang, Hawking
5 siblings, 1 reply; 10+ messages in thread
From: Tao Zhou @ 2022-12-07 10:03 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, stanley.yang, Gavin.Wan, Vignesh.Chander,
david.yu
Cc: Tao Zhou
1. no need to query poison mode on SRIOV guest side, host can handle it.
2. define the function to simplify code.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 54 +++++++++++++++----------
1 file changed, 33 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 20474708bc7d..2a5f23316f83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2414,11 +2414,42 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
pm_runtime_put_autosuspend(dev->dev);
}
+static void amdgpu_ras_poison_mode_query(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ bool df_poison, umc_poison;
+
+ /* poison setting is useless on SRIOV guest */
+ if (amdgpu_sriov_vf(adev) || !con)
+ return;
+
+ /* Init poison supported flag, the default value is false */
+ if (adev->gmc.xgmi.connected_to_cpu) {
+ /* enabled by default when GPU is connected to CPU */
+ con->poison_supported = true;
+ } else if (adev->df.funcs &&
+ adev->df.funcs->query_ras_poison_mode &&
+ adev->umc.ras &&
+ adev->umc.ras->query_ras_poison_mode) {
+ df_poison =
+ adev->df.funcs->query_ras_poison_mode(adev);
+ umc_poison =
+ adev->umc.ras->query_ras_poison_mode(adev);
+
+ /* Only poison is set in both DF and UMC, we can support it */
+ if (df_poison && umc_poison)
+ con->poison_supported = true;
+ else if (df_poison != umc_poison)
+ dev_warn(adev->dev,
+ "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+ df_poison, umc_poison);
+ }
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int r;
- bool df_poison, umc_poison;
if (con)
return 0;
@@ -2493,26 +2524,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
- /* Init poison supported flag, the default value is false */
- if (adev->gmc.xgmi.connected_to_cpu) {
- /* enabled by default when GPU is connected to CPU */
- con->poison_supported = true;
- }
- else if (adev->df.funcs &&
- adev->df.funcs->query_ras_poison_mode &&
- adev->umc.ras &&
- adev->umc.ras->query_ras_poison_mode) {
- df_poison =
- adev->df.funcs->query_ras_poison_mode(adev);
- umc_poison =
- adev->umc.ras->query_ras_poison_mode(adev);
- /* Only poison is set in both DF and UMC, we can support it */
- if (df_poison && umc_poison)
- con->poison_supported = true;
- else if (df_poison != umc_poison)
- dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
- df_poison, umc_poison);
- }
+ amdgpu_ras_poison_mode_query(adev);
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* RE: [PATCH 6/6] drm/amdgpu: define RAS poison mode query function
2022-12-07 10:03 ` [PATCH 6/6] drm/amdgpu: define RAS poison mode query function Tao Zhou
@ 2022-12-07 15:26 ` Zhang, Hawking
0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2022-12-07 15:26 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx, Yang, Stanley, Wan, Gavin, Chander, Vignesh,
Yu, David
[AMD Official Use Only - General]
Might be better rename to amdgdpu_ras_query_poison_mode to align with naming style of ip callbacks.
+static void amdgpu_ras_poison_mode_query
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Wan, Gavin <Gavin.Wan@amd.com>; Chander, Vignesh <Vignesh.Chander@amd.com>; Yu, David <David.Yu@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 6/6] drm/amdgpu: define RAS poison mode query function
1. no need to query poison mode on SRIOV guest side, host can handle it.
2. define the function to simplify code.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 54 +++++++++++++++----------
1 file changed, 33 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 20474708bc7d..2a5f23316f83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2414,11 +2414,42 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
pm_runtime_put_autosuspend(dev->dev);
}
+static void amdgpu_ras_poison_mode_query(struct amdgpu_device *adev) {
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ bool df_poison, umc_poison;
+
+ /* poison setting is useless on SRIOV guest */
+ if (amdgpu_sriov_vf(adev) || !con)
+ return;
+
+ /* Init poison supported flag, the default value is false */
+ if (adev->gmc.xgmi.connected_to_cpu) {
+ /* enabled by default when GPU is connected to CPU */
+ con->poison_supported = true;
+ } else if (adev->df.funcs &&
+ adev->df.funcs->query_ras_poison_mode &&
+ adev->umc.ras &&
+ adev->umc.ras->query_ras_poison_mode) {
+ df_poison =
+ adev->df.funcs->query_ras_poison_mode(adev);
+ umc_poison =
+ adev->umc.ras->query_ras_poison_mode(adev);
+
+ /* Only poison is set in both DF and UMC, we can support it */
+ if (df_poison && umc_poison)
+ con->poison_supported = true;
+ else if (df_poison != umc_poison)
+ dev_warn(adev->dev,
+ "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+ df_poison, umc_poison);
+ }
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int r;
- bool df_poison, umc_poison;
if (con)
return 0;
@@ -2493,26 +2524,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
- /* Init poison supported flag, the default value is false */
- if (adev->gmc.xgmi.connected_to_cpu) {
- /* enabled by default when GPU is connected to CPU */
- con->poison_supported = true;
- }
- else if (adev->df.funcs &&
- adev->df.funcs->query_ras_poison_mode &&
- adev->umc.ras &&
- adev->umc.ras->query_ras_poison_mode) {
- df_poison =
- adev->df.funcs->query_ras_poison_mode(adev);
- umc_poison =
- adev->umc.ras->query_ras_poison_mode(adev);
- /* Only poison is set in both DF and UMC, we can support it */
- if (df_poison && umc_poison)
- con->poison_supported = true;
- else if (df_poison != umc_poison)
- dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
- df_poison, umc_poison);
- }
+ amdgpu_ras_poison_mode_query(adev);
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* RE: [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt
2022-12-07 10:03 [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt Tao Zhou
` (4 preceding siblings ...)
2022-12-07 10:03 ` [PATCH 6/6] drm/amdgpu: define RAS poison mode query function Tao Zhou
@ 2022-12-07 15:25 ` Zhang, Hawking
5 siblings, 0 replies; 10+ messages in thread
From: Zhang, Hawking @ 2022-12-07 15:25 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx, Yang, Stanley, Wan, Gavin, Chander, Vignesh,
Yu, David
[AMD Official Use Only - General]
I suggest split the patch into two
One is adding ras poison handler for mxgpu ai products, similar as patch #2, including
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 6 ++++
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 1 +
The other is adding common umc poison handling path for sriov, including
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 44 ++++++++++++++----------
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Wan, Gavin <Gavin.Wan@amd.com>; Chander, Vignesh <Vignesh.Chander@amd.com>; Yu, David <David.Yu@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt
PF will do page retirement, reset VF and inform VF to reserve RAS bad pages.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 44 ++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 6 ++++
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 1 +
4 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index f76c19fc0392..1c7fcb4f2380 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -169,25 +169,33 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) {
int ret = AMDGPU_RAS_SUCCESS;
- if (!adev->gmc.xgmi.connected_to_cpu) {
- struct ras_err_data err_data = {0, 0, 0, NULL};
- struct ras_common_if head = {
- .block = AMDGPU_RAS_BLOCK__UMC,
- };
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
-
- ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
-
- if (ret == AMDGPU_RAS_SUCCESS && obj) {
- obj->err_data.ue_count += err_data.ue_count;
- obj->err_data.ce_count += err_data.ce_count;
+ if (!amdgpu_sriov_vf(adev)) {
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct ras_common_if head = {
+ .block = AMDGPU_RAS_BLOCK__UMC,
+ };
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+ ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+
+ if (ret == AMDGPU_RAS_SUCCESS && obj) {
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
+ }
+ } else if (reset) {
+ /* MCA poison handler is only responsible for GPU reset,
+ * let MCA notifier do page retirement.
+ */
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ amdgpu_ras_reset_gpu(adev);
}
- } else if (reset) {
- /* MCA poison handler is only responsible for GPU reset,
- * let MCA notifier do page retirement.
- */
- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- amdgpu_ras_reset_gpu(adev);
+ } else {
+ if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
+ adev->virt.ops->ras_poison_handler(adev);
+ else
+ dev_warn(adev->dev,
+ "No ras_poison_handler interface in SRIOV!\n");
}
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 2b9d806e23af..b9e9480448af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,6 +88,7 @@ struct amdgpu_virt_ops {
int (*wait_reset)(struct amdgpu_device *adev);
void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
u32 data1, u32 data2, u32 data3);
+ void (*ras_poison_handler)(struct amdgpu_device *adev);
};
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 12906ba74462..63725b2ebc03 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,6 +404,11 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA); }
+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev) {
+ xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); }
+
const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
.req_full_gpu = xgpu_ai_request_full_gpu_access,
.rel_full_gpu = xgpu_ai_release_full_gpu_access,
@@ -411,4 +416,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
.wait_reset = NULL,
.trans_msg = xgpu_ai_mailbox_trans_msg,
.req_init_data = xgpu_ai_request_init_data,
+ .ras_poison_handler = xgpu_ai_ras_poison_handler,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index fa7e13e0459e..0136bd059f68 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -39,6 +39,7 @@ enum idh_request {
IDH_LOG_VF_ERROR = 200,
IDH_READY_TO_RESET = 201,
+ IDH_RAS_POISON = 202,
};
enum idh_event {
--
2.35.1
^ permalink raw reply related [flat|nested] 10+ messages in thread