* [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
@ 2022-03-16 9:26 Tao Zhou
2022-03-16 9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
` (4 more replies)
0 siblings, 5 replies; 11+ messages in thread
From: Tao Zhou @ 2022-03-16 9:26 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang, yipeng.chai
Cc: Tao Zhou
Combine reading and setting poison flag as one atomic operation
and add print message for the function.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 7eedbcd14828..a992798ff8b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE {
static void event_interrupt_poison_consumption(struct kfd_dev *dev,
uint16_t pasid, uint16_t source_id)
{
- int ret = -EINVAL;
+ int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
return;
/* all queues of a process will be unmapped in one time */
- if (atomic_read(&p->poison)) {
- kfd_unref_process(p);
+ old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+ kfd_unref_process(p);
+ if (old_poison)
return;
- }
- atomic_set(&p->poison, 1);
- kfd_unref_process(p);
+ pr_warn("RAS poison consumption handling\n");
switch (source_id) {
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption
2022-03-16 9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
@ 2022-03-16 9:26 ` Tao Zhou
2022-03-16 13:52 ` Zhang, Hawking
2022-03-16 9:26 ` [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2) Tao Zhou
` (3 subsequent siblings)
4 siblings, 1 reply; 11+ messages in thread
From: Tao Zhou @ 2022-03-16 9:26 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang, yipeng.chai
Cc: Tao Zhou
client_id is more accurate here and we can deal with more different
cases.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
.../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 25 ++++++++++++++-----
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a992798ff8b6..e5f03f79546f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -91,7 +91,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
static void event_interrupt_poison_consumption(struct kfd_dev *dev,
- uint16_t pasid, uint16_t source_id)
+ uint16_t pasid, uint16_t client_id)
{
int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -107,11 +107,24 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
pr_warn("RAS poison consumption handling\n");
- switch (source_id) {
- case SOC15_INTSRC_SQ_INTERRUPT_MSG:
+ switch (client_id) {
+ case SOC15_IH_CLIENTID_GRBM_CP:
+ case SOC15_IH_CLIENTID_SE0SH:
+ case SOC15_IH_CLIENTID_SE1SH:
+ case SOC15_IH_CLIENTID_SE2SH:
+ case SOC15_IH_CLIENTID_SE3SH:
+ case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
break;
- case SOC15_INTSRC_SDMA_ECC:
+ case SOC15_IH_CLIENTID_SDMA0:
+ case SOC15_IH_CLIENTID_SDMA1:
+ case SOC15_IH_CLIENTID_SDMA2:
+ case SOC15_IH_CLIENTID_SDMA3:
+ case SOC15_IH_CLIENTID_SDMA4:
+ case SOC15_IH_CLIENTID_SDMA5:
+ case SOC15_IH_CLIENTID_SDMA6:
+ case SOC15_IH_CLIENTID_SDMA7:
+ break;
default:
break;
}
@@ -269,7 +282,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
sq_intr_err);
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
- event_interrupt_poison_consumption(dev, pasid, source_id);
+ event_interrupt_poison_consumption(dev, pasid, client_id);
return;
}
break;
@@ -290,7 +303,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
- event_interrupt_poison_consumption(dev, pasid, source_id);
+ event_interrupt_poison_consumption(dev, pasid, client_id);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)
2022-03-16 9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
2022-03-16 9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
@ 2022-03-16 9:26 ` Tao Zhou
2022-03-16 13:54 ` Zhang, Hawking
2022-03-16 9:26 ` [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2) Tao Zhou
` (2 subsequent siblings)
4 siblings, 1 reply; 11+ messages in thread
From: Tao Zhou @ 2022-03-16 9:26 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang, yipeng.chai
Cc: Tao Zhou
Add helper functions to query and reset RAS UTCL2 poison status.
v2: implement it on amdgpu side and kfd only calls it.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 8 ++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 +
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 14 ++++++++++++++
4 files changed, 24 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6ca1db3c243f..c18c4be1e4ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
else if (reset)
amdgpu_amdkfd_gpu_reset(adev);
}
+
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
+{
+ if (adev->gfx.ras->query_utcl2_poison_status)
+ return adev->gfx.ras->query_utcl2_poison_status(adev);
+ else
+ return false;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4cb14c2fe53f..0838926a8ef0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p);
int amdgpu_amdkfd_criu_resume(void *p);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
#if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index dcb3c7871c73..5ed9b8a4c571 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -202,6 +202,7 @@ struct amdgpu_cu_info {
struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+ bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
};
struct amdgpu_gfx_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 7653ebd0e67b..e0890e00eedf 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex);
}
+static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
+{
+ uint32_t status = 0;
+ struct amdgpu_vmhub *hub;
+
+ hub = &adev->vmhub[AMDGPU_GFXHUB_0];
+ status = RREG32(hub->vm_l2_pro_fault_status);
+ /* reset page fault status */
+ WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+ return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
+
struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = {
.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
.hw_ops = &gfx_v9_4_2_ras_ops,
},
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+ .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
};
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2)
2022-03-16 9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
2022-03-16 9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
2022-03-16 9:26 ` [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2) Tao Zhou
@ 2022-03-16 9:26 ` Tao Zhou
2022-03-16 13:56 ` Zhang, Hawking
2022-03-16 13:48 ` [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Zhang, Hawking
2022-03-16 14:04 ` Felix Kuehling
4 siblings, 1 reply; 11+ messages in thread
From: Tao Zhou @ 2022-03-16 9:26 UTC (permalink / raw)
To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang, yipeng.chai
Cc: Tao Zhou
Do RAS page retirement and use gpu reset as fallback in UTCL2 fault
handler.
v2: replace vm fault event with poison consumed event in UTCL2
poison consumption.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e5f03f79546f..55ee062a8496 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -312,6 +312,12 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+ if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+ amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+ event_interrupt_poison_consumption(dev, pasid, client_id);
+ return;
+ }
+
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* RE: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
2022-03-16 9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
` (2 preceding siblings ...)
2022-03-16 9:26 ` [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2) Tao Zhou
@ 2022-03-16 13:48 ` Zhang, Hawking
2022-03-16 13:55 ` Zhang, Hawking
2022-03-16 14:04 ` Felix Kuehling
4 siblings, 1 reply; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:48 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang, Stanley, Chai, Thomas
[AMD Official Use Only]
+ pr_warn("RAS poison consumption handling\n");
Given that you already print this, it might be better to add some more information to help identify which block issued this poison consumption interrupt, e.g. add client_id or source_id.
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
Combine reading and setting poison flag as one atomic operation and add print message for the function.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 7eedbcd14828..a992798ff8b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE { static void event_interrupt_poison_consumption(struct kfd_dev *dev,
uint16_t pasid, uint16_t source_id) {
- int ret = -EINVAL;
+ int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
return;
/* all queues of a process will be unmapped in one time */
- if (atomic_read(&p->poison)) {
- kfd_unref_process(p);
+ old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+ kfd_unref_process(p);
+ if (old_poison)
return;
- }
- atomic_set(&p->poison, 1);
- kfd_unref_process(p);
+ pr_warn("RAS poison consumption handling\n");
switch (source_id) {
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* RE: [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption
2022-03-16 9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
@ 2022-03-16 13:52 ` Zhang, Hawking
0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:52 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang, Stanley, Chai, Thomas
[AMD Official Use Only]
+ case SOC15_IH_CLIENTID_SDMA5:
+ case SOC15_IH_CLIENTID_SDMA6:
+ case SOC15_IH_CLIENTID_SDMA7:
Please drop the unnecessary cases here since the feature is not available on all the ASICs.
+ case SOC15_IH_CLIENTID_GRBM_CP:
+ case SOC15_IH_CLIENTID_SE0SH:
+ case SOC15_IH_CLIENTID_SE1SH:
+ case SOC15_IH_CLIENTID_SE2SH:
+ case SOC15_IH_CLIENTID_SE3SH:
And I also think we can reduce the cases for SQ interrupt as well
With above addressed, the patch is
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption
client_id is more accurate here and we can deal with more different cases.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
.../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 25 ++++++++++++++-----
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a992798ff8b6..e5f03f79546f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -91,7 +91,7 @@ enum SQ_INTERRUPT_ERROR_TYPE { #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
static void event_interrupt_poison_consumption(struct kfd_dev *dev,
- uint16_t pasid, uint16_t source_id)
+ uint16_t pasid, uint16_t client_id)
{
int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -107,11 +107,24 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
pr_warn("RAS poison consumption handling\n");
- switch (source_id) {
- case SOC15_INTSRC_SQ_INTERRUPT_MSG:
+ switch (client_id) {
+ case SOC15_IH_CLIENTID_GRBM_CP:
+ case SOC15_IH_CLIENTID_SE0SH:
+ case SOC15_IH_CLIENTID_SE1SH:
+ case SOC15_IH_CLIENTID_SE2SH:
+ case SOC15_IH_CLIENTID_SE3SH:
+ case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
break;
- case SOC15_INTSRC_SDMA_ECC:
+ case SOC15_IH_CLIENTID_SDMA0:
+ case SOC15_IH_CLIENTID_SDMA1:
+ case SOC15_IH_CLIENTID_SDMA2:
+ case SOC15_IH_CLIENTID_SDMA3:
+ case SOC15_IH_CLIENTID_SDMA4:
+ case SOC15_IH_CLIENTID_SDMA5:
+ case SOC15_IH_CLIENTID_SDMA6:
+ case SOC15_IH_CLIENTID_SDMA7:
+ break;
default:
break;
}
@@ -269,7 +282,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
sq_intr_err);
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
- event_interrupt_poison_consumption(dev, pasid, source_id);
+ event_interrupt_poison_consumption(dev, pasid, client_id);
return;
}
break;
@@ -290,7 +303,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
- event_interrupt_poison_consumption(dev, pasid, source_id);
+ event_interrupt_poison_consumption(dev, pasid, client_id);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* RE: [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)
2022-03-16 9:26 ` [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2) Tao Zhou
@ 2022-03-16 13:54 ` Zhang, Hawking
0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:54 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang, Stanley, Chai, Thomas
[AMD Official Use Only]
V2 looks good to me
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)
Add helper functions to query and reset RAS UTCL2 poison status.
v2: implement it on amdgpu side and kfd only calls it.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 8 ++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 +
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 14 ++++++++++++++
4 files changed, 24 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6ca1db3c243f..c18c4be1e4ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
else if (reset)
amdgpu_amdkfd_gpu_reset(adev);
}
+
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device
+*adev) {
+ if (adev->gfx.ras->query_utcl2_poison_status)
+ return adev->gfx.ras->query_utcl2_poison_status(adev);
+ else
+ return false;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4cb14c2fe53f..0838926a8ef0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device
+*adev);
#if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index dcb3c7871c73..5ed9b8a4c571 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -202,6 +202,7 @@ struct amdgpu_cu_info { struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+ bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
};
struct amdgpu_gfx_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 7653ebd0e67b..e0890e00eedf 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex);
}
+static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device
+*adev) {
+ uint32_t status = 0;
+ struct amdgpu_vmhub *hub;
+
+ hub = &adev->vmhub[AMDGPU_GFXHUB_0];
+ status = RREG32(hub->vm_l2_pro_fault_status);
+ /* reset page fault status */
+ WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+ return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); }
+
struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = {
.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
.hw_ops = &gfx_v9_4_2_ras_ops,
},
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+ .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
};
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* RE: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
2022-03-16 13:48 ` [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Zhang, Hawking
@ 2022-03-16 13:55 ` Zhang, Hawking
0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:55 UTC (permalink / raw)
To: Zhang, Hawking, Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang,
Stanley, Chai, Thomas
[AMD Official Use Only]
Hit send too quickly. With below addressed, the patch is
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Zhang, Hawking
Sent: Wednesday, March 16, 2022 21:49
To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: RE: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
[AMD Official Use Only]
[AMD Official Use Only]
+ pr_warn("RAS poison consumption handling\n");
Given that you already print this, it might be better to add some more information to help identify which block issued this poison consumption interrupt, e.g. add client_id or source_id.
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
Combine reading and setting poison flag as one atomic operation and add print message for the function.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 7eedbcd14828..a992798ff8b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE { static void event_interrupt_poison_consumption(struct kfd_dev *dev,
uint16_t pasid, uint16_t source_id) {
- int ret = -EINVAL;
+ int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
return;
/* all queues of a process will be unmapped in one time */
- if (atomic_read(&p->poison)) {
- kfd_unref_process(p);
+ old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+ kfd_unref_process(p);
+ if (old_poison)
return;
- }
- atomic_set(&p->poison, 1);
- kfd_unref_process(p);
+ pr_warn("RAS poison consumption handling\n");
switch (source_id) {
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* RE: [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2)
2022-03-16 9:26 ` [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2) Tao Zhou
@ 2022-03-16 13:56 ` Zhang, Hawking
0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:56 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang, Stanley, Chai, Thomas
[AMD Official Use Only]
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2)
Do RAS page retirement and use gpu reset as fallback in UTCL2 fault handler.
v2: replace vm fault event with poison consumed event in UTCL2 poison consumption.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e5f03f79546f..55ee062a8496 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -312,6 +312,12 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+ if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+ amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+ event_interrupt_poison_consumption(dev, pasid, client_id);
+ return;
+ }
+
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
--
2.35.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
2022-03-16 9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
` (3 preceding siblings ...)
2022-03-16 13:48 ` [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Zhang, Hawking
@ 2022-03-16 14:04 ` Felix Kuehling
2022-03-17 2:13 ` Zhou1, Tao
4 siblings, 1 reply; 11+ messages in thread
From: Felix Kuehling @ 2022-03-16 14:04 UTC (permalink / raw)
To: Tao Zhou, amd-gfx, hawking.zhang, stanley.yang, yipeng.chai
Am 2022-03-16 um 05:26 schrieb Tao Zhou:
> Combine reading and setting poison flag as one atomic operation
> and add print message for the function.
>
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
> 1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 7eedbcd14828..a992798ff8b6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE {
> static void event_interrupt_poison_consumption(struct kfd_dev *dev,
> uint16_t pasid, uint16_t source_id)
> {
> - int ret = -EINVAL;
> + int old_poison, ret = -EINVAL;
> struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>
> if (!p)
> return;
>
> /* all queues of a process will be unmapped in one time */
> - if (atomic_read(&p->poison)) {
> - kfd_unref_process(p);
> + old_poison = atomic_cmpxchg(&p->poison, 0, 1);
> + kfd_unref_process(p);
> + if (old_poison)
> return;
> - }
>
> - atomic_set(&p->poison, 1);
> - kfd_unref_process(p);
> + pr_warn("RAS poison consumption handling\n");
Is this left over from debugging? Or did you mean to add a warning
message here? Either way, the patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>
> switch (source_id) {
> case SOC15_INTSRC_SQ_INTERRUPT_MSG:
^ permalink raw reply [flat|nested] 11+ messages in thread
* RE: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
2022-03-16 14:04 ` Felix Kuehling
@ 2022-03-17 2:13 ` Zhou1, Tao
0 siblings, 0 replies; 11+ messages in thread
From: Zhou1, Tao @ 2022-03-17 2:13 UTC (permalink / raw)
To: Kuehling, Felix, amd-gfx, Zhang, Hawking, Yang, Stanley, Chai, Thomas
[AMD Official Use Only]
> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: Wednesday, March 16, 2022 10:04 PM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Zhang,
> Hawking <Hawking.Zhang@amd.com>; Yang, Stanley
> <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: Re: [PATCH 1/4] drm/amdkfd: refine
> event_interrupt_poison_consumption
>
> Am 2022-03-16 um 05:26 schrieb Tao Zhou:
> > Combine reading and setting poison flag as one atomic operation and
> > add print message for the function.
> >
> > Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> > ---
> > drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
> > 1 file changed, 5 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > index 7eedbcd14828..a992798ff8b6 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > @@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE {
> > static void event_interrupt_poison_consumption(struct kfd_dev *dev,
> > uint16_t pasid, uint16_t source_id)
> > {
> > - int ret = -EINVAL;
> > + int old_poison, ret = -EINVAL;
> > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> >
> > if (!p)
> > return;
> >
> > /* all queues of a process will be unmapped in one time */
> > - if (atomic_read(&p->poison)) {
> > - kfd_unref_process(p);
> > + old_poison = atomic_cmpxchg(&p->poison, 0, 1);
> > + kfd_unref_process(p);
> > + if (old_poison)
> > return;
> > - }
> >
> > - atomic_set(&p->poison, 1);
> > - kfd_unref_process(p);
> > + pr_warn("RAS poison consumption handling\n");
>
> Is this left over from debugging? Or did you mean to add a warning message
> here? Either way, the patch is
Both are intentional: poison consumption is handled quietly if everything goes well, so the message is helpful for debugging and QA testing.
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>
>
> >
> > switch (source_id) {
> > case SOC15_INTSRC_SQ_INTERRUPT_MSG:
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2022-03-17 2:13 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-16 9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
2022-03-16 9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
2022-03-16 13:52 ` Zhang, Hawking
2022-03-16 9:26 ` [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2) Tao Zhou
2022-03-16 13:54 ` Zhang, Hawking
2022-03-16 9:26 ` [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2) Tao Zhou
2022-03-16 13:56 ` Zhang, Hawking
2022-03-16 13:48 ` [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Zhang, Hawking
2022-03-16 13:55 ` Zhang, Hawking
2022-03-16 14:04 ` Felix Kuehling
2022-03-17 2:13 ` Zhou1, Tao
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.