All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
@ 2022-03-16  9:26 Tao Zhou
  2022-03-16  9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Tao Zhou @ 2022-03-16  9:26 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang, yipeng.chai
  Cc: Tao Zhou

Combine reading and setting poison flag as one atomic operation
and add print message for the function.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 7eedbcd14828..a992798ff8b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption(struct kfd_dev *dev,
 				uint16_t pasid, uint16_t source_id)
 {
-	int ret = -EINVAL;
+	int old_poison, ret = -EINVAL;
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
 	if (!p)
 		return;
 
 	/* all queues of a process will be unmapped in one time */
-	if (atomic_read(&p->poison)) {
-		kfd_unref_process(p);
+	old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+	kfd_unref_process(p);
+	if (old_poison)
 		return;
-	}
 
-	atomic_set(&p->poison, 1);
-	kfd_unref_process(p);
+	pr_warn("RAS poison consumption handling\n");
 
 	switch (source_id) {
 	case SOC15_INTSRC_SQ_INTERRUPT_MSG:
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption
  2022-03-16  9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
@ 2022-03-16  9:26 ` Tao Zhou
  2022-03-16 13:52   ` Zhang, Hawking
  2022-03-16  9:26 ` [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2) Tao Zhou
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Tao Zhou @ 2022-03-16  9:26 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang, yipeng.chai
  Cc: Tao Zhou

client_id is more accurate here and lets us handle more distinct
cases.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a992798ff8b6..e5f03f79546f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -91,7 +91,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
 static void event_interrupt_poison_consumption(struct kfd_dev *dev,
-				uint16_t pasid, uint16_t source_id)
+				uint16_t pasid, uint16_t client_id)
 {
 	int old_poison, ret = -EINVAL;
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -107,11 +107,24 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
 
 	pr_warn("RAS poison consumption handling\n");
 
-	switch (source_id) {
-	case SOC15_INTSRC_SQ_INTERRUPT_MSG:
+	switch (client_id) {
+	case SOC15_IH_CLIENTID_GRBM_CP:
+	case SOC15_IH_CLIENTID_SE0SH:
+	case SOC15_IH_CLIENTID_SE1SH:
+	case SOC15_IH_CLIENTID_SE2SH:
+	case SOC15_IH_CLIENTID_SE3SH:
+	case SOC15_IH_CLIENTID_UTCL2:
 		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
 		break;
-	case SOC15_INTSRC_SDMA_ECC:
+	case SOC15_IH_CLIENTID_SDMA0:
+	case SOC15_IH_CLIENTID_SDMA1:
+	case SOC15_IH_CLIENTID_SDMA2:
+	case SOC15_IH_CLIENTID_SDMA3:
+	case SOC15_IH_CLIENTID_SDMA4:
+	case SOC15_IH_CLIENTID_SDMA5:
+	case SOC15_IH_CLIENTID_SDMA6:
+	case SOC15_IH_CLIENTID_SDMA7:
+		break;
 	default:
 		break;
 	}
@@ -269,7 +282,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 					sq_intr_err);
 				if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
 					sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-					event_interrupt_poison_consumption(dev, pasid, source_id);
+					event_interrupt_poison_consumption(dev, pasid, client_id);
 					return;
 				}
 				break;
@@ -290,7 +303,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		if (source_id == SOC15_INTSRC_SDMA_TRAP) {
 			kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
 		} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-			event_interrupt_poison_consumption(dev, pasid, source_id);
+			event_interrupt_poison_consumption(dev, pasid, client_id);
 			return;
 		}
 	} else if (client_id == SOC15_IH_CLIENTID_VMC ||
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)
  2022-03-16  9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
  2022-03-16  9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
@ 2022-03-16  9:26 ` Tao Zhou
  2022-03-16 13:54   ` Zhang, Hawking
  2022-03-16  9:26 ` [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2) Tao Zhou
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Tao Zhou @ 2022-03-16  9:26 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang, yipeng.chai
  Cc: Tao Zhou

Add helper functions to query and reset RAS UTCL2 poison status.

v2: implement it on amdgpu side and kfd only calls it.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c    | 14 ++++++++++++++
 4 files changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6ca1db3c243f..c18c4be1e4ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
 	else if (reset)
 		amdgpu_amdkfd_gpu_reset(adev);
 }
+
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
+{
+	if (adev->gfx.ras->query_utcl2_poison_status)
+		return adev->gfx.ras->query_utcl2_poison_status(adev);
+	else
+		return false;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4cb14c2fe53f..0838926a8ef0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
 
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index dcb3c7871c73..5ed9b8a4c571 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -202,6 +202,7 @@ struct amdgpu_cu_info {
 struct amdgpu_gfx_ras {
 	struct amdgpu_ras_block_object  ras_block;
 	void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+	bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_gfx_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 7653ebd0e67b..e0890e00eedf 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
 	mutex_unlock(&adev->grbm_idx_mutex);
 }
 
+static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
+{
+	uint32_t status = 0;
+	struct amdgpu_vmhub *hub;
+
+	hub = &adev->vmhub[AMDGPU_GFXHUB_0];
+	status = RREG32(hub->vm_l2_pro_fault_status);
+	/* reset page fault status */
+	WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+	return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
+
 struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {
 		.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
 		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
 		.hw_ops = &gfx_v9_4_2_ras_ops,
 	},
 	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+	.query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
 };
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2)
  2022-03-16  9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
  2022-03-16  9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
  2022-03-16  9:26 ` [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2) Tao Zhou
@ 2022-03-16  9:26 ` Tao Zhou
  2022-03-16 13:56   ` Zhang, Hawking
  2022-03-16 13:48 ` [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Zhang, Hawking
  2022-03-16 14:04 ` Felix Kuehling
  4 siblings, 1 reply; 11+ messages in thread
From: Tao Zhou @ 2022-03-16  9:26 UTC (permalink / raw)
  To: amd-gfx, hawking.zhang, Felix.Kuehling, stanley.yang, yipeng.chai
  Cc: Tao Zhou

Do RAS page retirement and use gpu reset as fallback in UTCL2 fault
handler.

v2: replace vm fault event with poison consumed event in UTCL2
poison consumption.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e5f03f79546f..55ee062a8496 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -312,6 +312,12 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		struct kfd_vm_fault_info info = {0};
 		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
 
+		if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+		    amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+			event_interrupt_poison_consumption(dev, pasid, client_id);
+			return;
+		}
+
 		info.vmid = vmid;
 		info.mc_id = client_id;
 		info.page_addr = ih_ring_entry[4] |
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* RE: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
  2022-03-16  9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
                   ` (2 preceding siblings ...)
  2022-03-16  9:26 ` [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2) Tao Zhou
@ 2022-03-16 13:48 ` Zhang, Hawking
  2022-03-16 13:55   ` Zhang, Hawking
  2022-03-16 14:04 ` Felix Kuehling
  4 siblings, 1 reply; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:48 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang, Stanley, Chai, Thomas

[AMD Official Use Only]

+       pr_warn("RAS poison consumption handling\n");

Since you are already printing this, it might be better to add some more information to help identify which block issued this poison consumption interrupt, e.g. add client_id or source_id.

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption

Combine reading and setting poison flag as one atomic operation and add print message for the function.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 7eedbcd14828..a992798ff8b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE {  static void event_interrupt_poison_consumption(struct kfd_dev *dev,
                                uint16_t pasid, uint16_t source_id)  {
-       int ret = -EINVAL;
+       int old_poison, ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

        if (!p)
                return;

        /* all queues of a process will be unmapped in one time */
-       if (atomic_read(&p->poison)) {
-               kfd_unref_process(p);
+       old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+       kfd_unref_process(p);
+       if (old_poison)
                return;
-       }

-       atomic_set(&p->poison, 1);
-       kfd_unref_process(p);
+       pr_warn("RAS poison consumption handling\n");

        switch (source_id) {
        case SOC15_INTSRC_SQ_INTERRUPT_MSG:
--
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* RE: [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption
  2022-03-16  9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
@ 2022-03-16 13:52   ` Zhang, Hawking
  0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:52 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang, Stanley, Chai, Thomas

[AMD Official Use Only]

+       case SOC15_IH_CLIENTID_SDMA5:
+       case SOC15_IH_CLIENTID_SDMA6:
+       case SOC15_IH_CLIENTID_SDMA7:

Please drop the unnecessary case here since the feature is not available on all the ASICs.

+       case SOC15_IH_CLIENTID_GRBM_CP:
+       case SOC15_IH_CLIENTID_SE0SH:
+       case SOC15_IH_CLIENTID_SE1SH:
+       case SOC15_IH_CLIENTID_SE2SH:
+       case SOC15_IH_CLIENTID_SE3SH:

And I also think we can reduce the cases for SQ interrupt as well

With above addressed, the patch is

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption

client_id is more accurate here and lets us handle more distinct cases.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a992798ff8b6..e5f03f79546f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -91,7 +91,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {  #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20

 static void event_interrupt_poison_consumption(struct kfd_dev *dev,
-                               uint16_t pasid, uint16_t source_id)
+                               uint16_t pasid, uint16_t client_id)
 {
        int old_poison, ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -107,11 +107,24 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,

        pr_warn("RAS poison consumption handling\n");

-       switch (source_id) {
-       case SOC15_INTSRC_SQ_INTERRUPT_MSG:
+       switch (client_id) {
+       case SOC15_IH_CLIENTID_GRBM_CP:
+       case SOC15_IH_CLIENTID_SE0SH:
+       case SOC15_IH_CLIENTID_SE1SH:
+       case SOC15_IH_CLIENTID_SE2SH:
+       case SOC15_IH_CLIENTID_SE3SH:
+       case SOC15_IH_CLIENTID_UTCL2:
                ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
                break;
-       case SOC15_INTSRC_SDMA_ECC:
+       case SOC15_IH_CLIENTID_SDMA0:
+       case SOC15_IH_CLIENTID_SDMA1:
+       case SOC15_IH_CLIENTID_SDMA2:
+       case SOC15_IH_CLIENTID_SDMA3:
+       case SOC15_IH_CLIENTID_SDMA4:
+       case SOC15_IH_CLIENTID_SDMA5:
+       case SOC15_IH_CLIENTID_SDMA6:
+       case SOC15_IH_CLIENTID_SDMA7:
+               break;
        default:
                break;
        }
@@ -269,7 +282,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                                        sq_intr_err);
                                if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
                                        sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-                                       event_interrupt_poison_consumption(dev, pasid, source_id);
+                                       event_interrupt_poison_consumption(dev, pasid, client_id);
                                        return;
                                }
                                break;
@@ -290,7 +303,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                if (source_id == SOC15_INTSRC_SDMA_TRAP) {
                        kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
                } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-                       event_interrupt_poison_consumption(dev, pasid, source_id);
+                       event_interrupt_poison_consumption(dev, pasid, client_id);
                        return;
                }
        } else if (client_id == SOC15_IH_CLIENTID_VMC ||
--
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* RE: [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)
  2022-03-16  9:26 ` [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2) Tao Zhou
@ 2022-03-16 13:54   ` Zhang, Hawking
  0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:54 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang, Stanley, Chai, Thomas

[AMD Official Use Only]

V2 looks good to me

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)

Add helper functions to query and reset RAS UTCL2 poison status.

v2: implement it on amdgpu side and kfd only calls it.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c    | 14 ++++++++++++++
 4 files changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6ca1db3c243f..c18c4be1e4ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
        else if (reset)
                amdgpu_amdkfd_gpu_reset(adev);
 }
+
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device
+*adev) {
+       if (adev->gfx.ras->query_utcl2_poison_status)
+               return adev->gfx.ras->query_utcl2_poison_status(adev);
+       else
+               return false;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4cb14c2fe53f..0838926a8ef0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,  bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);  void amdgpu_amdkfd_block_mmu_notifications(void *p);  int amdgpu_amdkfd_criu_resume(void *p);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device
+*adev);

 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index dcb3c7871c73..5ed9b8a4c571 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -202,6 +202,7 @@ struct amdgpu_cu_info {  struct amdgpu_gfx_ras {
        struct amdgpu_ras_block_object  ras_block;
        void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+       bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
 };

 struct amdgpu_gfx_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 7653ebd0e67b..e0890e00eedf 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
        mutex_unlock(&adev->grbm_idx_mutex);
 }

+static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device
+*adev) {
+       uint32_t status = 0;
+       struct amdgpu_vmhub *hub;
+
+       hub = &adev->vmhub[AMDGPU_GFXHUB_0];
+       status = RREG32(hub->vm_l2_pro_fault_status);
+       /* reset page fault status */
+       WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+       return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); }
+
 struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {
                .ras_error_inject = &gfx_v9_4_2_ras_error_inject,
                .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
                .hw_ops = &gfx_v9_4_2_ras_ops,
        },
        .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+       .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
 };
--
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* RE: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
  2022-03-16 13:48 ` [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Zhang, Hawking
@ 2022-03-16 13:55   ` Zhang, Hawking
  0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:55 UTC (permalink / raw)
  To: Zhang, Hawking, Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang,
	Stanley, Chai, Thomas

[AMD Official Use Only]

Hit send too quickly. With below addressed, the patch is

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Zhang, Hawking
Sent: Wednesday, March 16, 2022 21:49
To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: RE: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption

[AMD Official Use Only]

[AMD Official Use Only]

+       pr_warn("RAS poison consumption handling\n");

Since you are already printing this, it might be better to add some more information to help identify which block issued this poison consumption interrupt, e.g. add client_id or source_id.

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption

Combine reading and setting poison flag as one atomic operation and add print message for the function.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 7eedbcd14828..a992798ff8b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE {  static void event_interrupt_poison_consumption(struct kfd_dev *dev,
                                uint16_t pasid, uint16_t source_id)  {
-       int ret = -EINVAL;
+       int old_poison, ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

        if (!p)
                return;

        /* all queues of a process will be unmapped in one time */
-       if (atomic_read(&p->poison)) {
-               kfd_unref_process(p);
+       old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+       kfd_unref_process(p);
+       if (old_poison)
                return;
-       }

-       atomic_set(&p->poison, 1);
-       kfd_unref_process(p);
+       pr_warn("RAS poison consumption handling\n");

        switch (source_id) {
        case SOC15_INTSRC_SQ_INTERRUPT_MSG:
--
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* RE: [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2)
  2022-03-16  9:26 ` [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2) Tao Zhou
@ 2022-03-16 13:56   ` Zhang, Hawking
  0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Hawking @ 2022-03-16 13:56 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx, Kuehling, Felix, Yang, Stanley, Chai, Thomas

[AMD Official Use Only]

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2)

Do RAS page retirement and use gpu reset as fallback in UTCL2 fault handler.

v2: replace vm fault event with poison consumed event in UTCL2 poison consumption.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e5f03f79546f..55ee062a8496 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -312,6 +312,12 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                struct kfd_vm_fault_info info = {0};
                uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);

+               if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+                   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+                       event_interrupt_poison_consumption(dev, pasid, client_id);
+                       return;
+               }
+
                info.vmid = vmid;
                info.mc_id = client_id;
                info.page_addr = ih_ring_entry[4] |
--
2.35.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
  2022-03-16  9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
                   ` (3 preceding siblings ...)
  2022-03-16 13:48 ` [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Zhang, Hawking
@ 2022-03-16 14:04 ` Felix Kuehling
  2022-03-17  2:13   ` Zhou1, Tao
  4 siblings, 1 reply; 11+ messages in thread
From: Felix Kuehling @ 2022-03-16 14:04 UTC (permalink / raw)
  To: Tao Zhou, amd-gfx, hawking.zhang, stanley.yang, yipeng.chai

Am 2022-03-16 um 05:26 schrieb Tao Zhou:
> Combine reading and setting poison flag as one atomic operation
> and add print message for the function.
>
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
>   1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 7eedbcd14828..a992798ff8b6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE {
>   static void event_interrupt_poison_consumption(struct kfd_dev *dev,
>   				uint16_t pasid, uint16_t source_id)
>   {
> -	int ret = -EINVAL;
> +	int old_poison, ret = -EINVAL;
>   	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>   
>   	if (!p)
>   		return;
>   
>   	/* all queues of a process will be unmapped in one time */
> -	if (atomic_read(&p->poison)) {
> -		kfd_unref_process(p);
> +	old_poison = atomic_cmpxchg(&p->poison, 0, 1);
> +	kfd_unref_process(p);
> +	if (old_poison)
>   		return;
> -	}
>   
> -	atomic_set(&p->poison, 1);
> -	kfd_unref_process(p);
> +	pr_warn("RAS poison consumption handling\n");

Is this left over from debugging? Or did you mean to add a warning 
message here? Either way, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


>   
>   	switch (source_id) {
>   	case SOC15_INTSRC_SQ_INTERRUPT_MSG:

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption
  2022-03-16 14:04 ` Felix Kuehling
@ 2022-03-17  2:13   ` Zhou1, Tao
  0 siblings, 0 replies; 11+ messages in thread
From: Zhou1, Tao @ 2022-03-17  2:13 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx, Zhang, Hawking, Yang, Stanley, Chai, Thomas

[AMD Official Use Only]



> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: Wednesday, March 16, 2022 10:04 PM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org; Zhang,
> Hawking <Hawking.Zhang@amd.com>; Yang, Stanley
> <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: Re: [PATCH 1/4] drm/amdkfd: refine
> event_interrupt_poison_consumption
> 
> Am 2022-03-16 um 05:26 schrieb Tao Zhou:
> > Combine reading and setting poison flag as one atomic operation and
> > add print message for the function.
> >
> > Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 11 +++++------
> >   1 file changed, 5 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > index 7eedbcd14828..a992798ff8b6 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> > @@ -93,20 +93,19 @@ enum SQ_INTERRUPT_ERROR_TYPE {
> >   static void event_interrupt_poison_consumption(struct kfd_dev *dev,
> >   				uint16_t pasid, uint16_t source_id)
> >   {
> > -	int ret = -EINVAL;
> > +	int old_poison, ret = -EINVAL;
> >   	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> >
> >   	if (!p)
> >   		return;
> >
> >   	/* all queues of a process will be unmapped in one time */
> > -	if (atomic_read(&p->poison)) {
> > -		kfd_unref_process(p);
> > +	old_poison = atomic_cmpxchg(&p->poison, 0, 1);
> > +	kfd_unref_process(p);
> > +	if (old_poison)
> >   		return;
> > -	}
> >
> > -	atomic_set(&p->poison, 1);
> > -	kfd_unref_process(p);
> > +	pr_warn("RAS poison consumption handling\n");
> 
> Is this left over from debugging? Or did you mean to add a warning message
> here? Either way, the patch is

Both are intended: poison consumption is handled quietly if everything goes well, so the message is helpful for debugging and QA testing.

> 
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> 
> 
> >
> >   	switch (source_id) {
> >   	case SOC15_INTSRC_SQ_INTERRUPT_MSG:

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2022-03-17  2:13 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-16  9:26 [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Tao Zhou
2022-03-16  9:26 ` [PATCH 2/4] drm/amdkfd: replace source_id with client_id for RAS poison consumption Tao Zhou
2022-03-16 13:52   ` Zhang, Hawking
2022-03-16  9:26 ` [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2) Tao Zhou
2022-03-16 13:54   ` Zhang, Hawking
2022-03-16  9:26 ` [PATCH 4/4] drm/amdkfd: add RAS poison consumption handling for UTCL2 (v2) Tao Zhou
2022-03-16 13:56   ` Zhang, Hawking
2022-03-16 13:48 ` [PATCH 1/4] drm/amdkfd: refine event_interrupt_poison_consumption Zhang, Hawking
2022-03-16 13:55   ` Zhang, Hawking
2022-03-16 14:04 ` Felix Kuehling
2022-03-17  2:13   ` Zhou1, Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.