amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page
@ 2024-04-18  2:58 YiPeng Chai
  2024-04-18  2:58 ` [PATCH 02/15] drm/amdgpu: add message fifo to handle RAS poison events YiPeng Chai
                   ` (14 more replies)
  0 siblings, 15 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

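Add amdgpu_ras_reserve_page(), an interface to reserve a bad page through
the VRAM manager. The page is reserved only when
amdgpu_vram_mgr_query_page_status() returns -ENOENT for it, and the query
and the reservation are serialized by the new page_rsv_lock mutex.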

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 ++++
 2 files changed, 23 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2c97cb80d79a..05782d68f073 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2782,6 +2782,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		}
 	}
 
+	mutex_init(&con->page_rsv_lock);
 	mutex_init(&con->page_retirement_lock);
 	init_waitqueue_head(&con->page_retirement_wq);
 	atomic_set(&con->page_retirement_req_cnt, 0);
@@ -2835,6 +2836,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 
 	atomic_set(&con->page_retirement_req_cnt, 0);
 
+	mutex_destroy(&con->page_rsv_lock);
+
 	cancel_work_sync(&con->recovery_work);
 
 	mutex_lock(&con->recovery_lock);
@@ -4278,3 +4281,19 @@ void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
 			amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);
 	}
 }
+
+int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
+	uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
+	int ret = 0;
+
+	mutex_lock(&con->page_rsv_lock);
+	ret = amdgpu_vram_mgr_query_page_status(mgr, start);
+	if (ret == -ENOENT)
+		ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE);
+	mutex_unlock(&con->page_rsv_lock);
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 8d26989c75c8..ab5bf573378e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -500,6 +500,7 @@ struct amdgpu_ras {
 	wait_queue_head_t page_retirement_wq;
 	struct mutex page_retirement_lock;
 	atomic_t page_retirement_req_cnt;
+	struct mutex page_rsv_lock;
 	/* Fatal error detected flag */
 	atomic_t fed;
 
@@ -909,4 +910,7 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
 
 bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
 u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
+
+int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
+
 #endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 02/15] drm/amdgpu: add message fifo to handle RAS poison events
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 03/15] drm/amdgpu: prepare for logging ecc errors YiPeng Chai
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

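Add a message fifo to handle RAS poison events: a 128-entry kfifo of
struct ras_poison_msg, with amdgpu_ras_put_poison_req() to queue a request
and amdgpu_ras_get_poison_req() to fetch one.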

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 18 ++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 05782d68f073..0de5a22e7126 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2693,6 +2693,37 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
 	}
 }
 
+int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block, uint16_t pasid,
+		pasid_notify pasid_fn, void *data, uint32_t reset)
+{
+	int ret = 0;
+	struct ras_poison_msg poison_msg;
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+	memset(&poison_msg, 0, sizeof(poison_msg));
+	poison_msg.block = block;
+	poison_msg.pasid = pasid;
+	poison_msg.reset = reset;
+	poison_msg.pasid_fn = pasid_fn;
+	poison_msg.data = data;
+
+	ret = kfifo_put(&con->poison_fifo, poison_msg);
+	if (!ret) {
+		dev_err(adev->dev, "Poison message fifo is full!\n");
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+
+static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
+		struct ras_poison_msg *poison_msg)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+	return kfifo_get(&con->poison_fifo, poison_msg);
+}
 static int amdgpu_ras_page_retirement_thread(void *param)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)param;
@@ -2783,6 +2814,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	}
 
 	mutex_init(&con->page_rsv_lock);
+	INIT_KFIFO(con->poison_fifo);
 	mutex_init(&con->page_retirement_lock);
 	init_waitqueue_head(&con->page_retirement_wq);
 	atomic_set(&con->page_retirement_req_cnt, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ab5bf573378e..2b15996f1ede 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -26,6 +26,7 @@
 
 #include <linux/debugfs.h>
 #include <linux/list.h>
+#include <linux/kfifo.h>
 #include "ta_ras_if.h"
 #include "amdgpu_ras_eeprom.h"
 #include "amdgpu_smuio.h"
@@ -442,6 +443,17 @@ struct ras_query_context {
 	u64 event_id;
 };
 
+typedef int (*pasid_notify)(struct amdgpu_device *adev,
+		uint16_t pasid, void *data);
+
+struct ras_poison_msg {
+	enum amdgpu_ras_block block;
+	uint16_t pasid;
+	uint32_t reset;
+	pasid_notify pasid_fn;
+	void *data;
+};
+
 struct amdgpu_ras {
 	/* ras infrastructure */
 	/* for ras itself. */
@@ -501,6 +513,8 @@ struct amdgpu_ras {
 	struct mutex page_retirement_lock;
 	atomic_t page_retirement_req_cnt;
 	struct mutex page_rsv_lock;
+	DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
+
 	/* Fatal error detected flag */
 	atomic_t fed;
 
@@ -913,4 +927,8 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type
 
 int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
 
+int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block, uint16_t pasid,
+		pasid_notify pasid_fn, void *data, uint32_t reset);
+
 #endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 03/15] drm/amdgpu: prepare for logging ecc errors
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
  2024-04-18  2:58 ` [PATCH 02/15] drm/amdgpu: add message fifo to handle RAS poison events YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 04/15] drm/amdgpu: add poison creation handler YiPeng Chai
                   ` (12 subsequent siblings)
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

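Prepare for logging ECC errors: add struct ras_ecc_log_info (a mutex, a
siphash key, a radix tree and a de_updated flag) to struct amdgpu_ras, and
initialize it in amdgpu_ras_recovery_init() and tear it down in
amdgpu_ras_recovery_fini().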

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 33 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 23 +++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0de5a22e7126..64e6e20c6de7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2724,6 +2724,36 @@ static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
 
 	return kfifo_get(&con->poison_fifo, poison_msg);
 }
+
+static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
+{
+	mutex_init(&ecc_log->lock);
+
+	/* Set any value as siphash key */
+	memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
+
+	INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
+	ecc_log->de_updated = false;
+}
+
+static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
+{
+	struct radix_tree_iter iter;
+	void __rcu **slot;
+	struct ras_ecc_err *ecc_err;
+
+	mutex_lock(&ecc_log->lock);
+	radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) {
+		ecc_err = radix_tree_deref_slot(slot);
+		kfree(ecc_err->err_pages.pfn);
+		kfree(ecc_err);
+		radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot);
+	}
+	mutex_unlock(&ecc_log->lock);
+
+	mutex_destroy(&ecc_log->lock);
+	ecc_log->de_updated = false;
+}
 static int amdgpu_ras_page_retirement_thread(void *param)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)param;
@@ -2825,6 +2855,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
 	}
 
+	amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
 #ifdef CONFIG_X86_MCE_AMD
 #ifdef HAVE_SMCA_UMC_V2
 	if ((adev->asic_type == CHIP_ALDEBARAN) &&
@@ -2872,6 +2903,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 
 	cancel_work_sync(&con->recovery_work);
 
+	amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
+
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 2b15996f1ede..634654cf2634 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -27,6 +27,8 @@
 #include <linux/debugfs.h>
 #include <linux/list.h>
 #include <linux/kfifo.h>
+#include <linux/radix-tree.h>
+#include <linux/siphash.h>
 #include "ta_ras_if.h"
 #include "amdgpu_ras_eeprom.h"
 #include "amdgpu_smuio.h"
@@ -454,6 +456,26 @@ struct ras_poison_msg {
 	void *data;
 };
 
+struct ras_err_pages {
+	uint32_t count;
+	uint64_t *pfn;
+};
+
+struct ras_ecc_err {
+	u64 hash_index;
+	uint64_t status;
+	uint64_t ipid;
+	uint64_t addr;
+	struct ras_err_pages err_pages;
+};
+
+struct ras_ecc_log_info {
+	struct mutex lock;
+	siphash_key_t ecc_key;
+	struct radix_tree_root de_page_tree;
+	bool	de_updated;
+};
+
 struct amdgpu_ras {
 	/* ras infrastructure */
 	/* for ras itself. */
@@ -514,6 +536,7 @@ struct amdgpu_ras {
 	atomic_t page_retirement_req_cnt;
 	struct mutex page_rsv_lock;
 	DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
+	struct ras_ecc_log_info  umc_ecc_log;
 
 	/* Fatal error detected flag */
 	atomic_t fed;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 04/15] drm/amdgpu: add poison creation handler
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
  2024-04-18  2:58 ` [PATCH 02/15] drm/amdgpu: add message fifo to handle RAS poison events YiPeng Chai
  2024-04-18  2:58 ` [PATCH 03/15] drm/amdgpu: prepare for logging ecc errors YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-25  2:32   ` Zhang, Hawking
  2024-04-18  2:58 ` [PATCH 05/15] drm/amdgpu: add interface to update umc v12_0 ecc status YiPeng Chai
                   ` (11 subsequent siblings)
  14 siblings, 1 reply; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

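Add a poison creation handler. On UMC IP version 12.0.0 and later, the
poison creation interrupt queues a UMC request on the poison fifo and wakes
the page retirement thread, which then queries the UMC error status until a
deferred error is reported or the polling timeout expires.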

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 74 +++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 64e6e20c6de7..126616eaeec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2080,6 +2080,17 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
 {
 	dev_info(obj->adev->dev,
 		"Poison is created\n");
+
+	if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+		struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
+
+		amdgpu_ras_put_poison_req(obj->adev,
+			AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
+
+		atomic_inc(&con->page_retirement_req_cnt);
+
+		wake_up(&con->page_retirement_wq);
+	}
 }
 
 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
@@ -2754,10 +2765,54 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
 	mutex_destroy(&ecc_log->lock);
 	ecc_log->de_updated = false;
 }
+
+static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+			enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
+{
+	int ret = 0;
+	struct ras_ecc_log_info *ecc_log;
+	struct ras_query_if info;
+	uint32_t timeout = timeout_ms;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	memset(&info, 0, sizeof(info));
+	info.head.block = ras_block;
+
+	ecc_log = &ras->umc_ecc_log;
+	ecc_log->de_updated = false;
+	do {
+		ret = amdgpu_ras_query_error_status(adev, &info);
+		if (ret) {
+			dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
+			return ret;
+		}
+
+		if (timeout && !ecc_log->de_updated) {
+			msleep(1);
+			timeout--;
+		}
+	} while (timeout && !ecc_log->de_updated);
+
+	if (timeout_ms && !timeout) {
+		dev_warn(adev->dev, "Can't find deferred error\n");
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+					uint32_t timeout)
+{
+	amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+}
+
 static int amdgpu_ras_page_retirement_thread(void *param)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)param;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_poison_msg poison_msg;
+	enum amdgpu_ras_block ras_block;
 
 	while (!kthread_should_stop()) {
 
@@ -2768,13 +2823,22 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 		if (kthread_should_stop())
 			break;
 
-		dev_info(adev->dev, "Start processing page retirement. request:%d\n",
-			atomic_read(&con->page_retirement_req_cnt));
-
 		atomic_dec(&con->page_retirement_req_cnt);
 
-		amdgpu_umc_bad_page_polling_timeout(adev,
-				0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+		if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
+			continue;
+
+		ras_block = poison_msg.block;
+
+		dev_info(adev->dev, "Start processing ras block %s(%d)\n",
+				ras_block_str(ras_block), ras_block);
+
+		if (ras_block == AMDGPU_RAS_BLOCK__UMC)
+			amdgpu_ras_poison_creation_handler(adev,
+				MAX_UMC_POISON_POLLING_TIME_ASYNC);
+		else
+			amdgpu_umc_bad_page_polling_timeout(adev,
+				false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
 	}
 
 	return 0;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 05/15] drm/amdgpu: add interface to update umc v12_0 ecc status
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (2 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 04/15] drm/amdgpu: add poison creation handler YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address YiPeng Chai
                   ` (10 subsequent siblings)
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

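Add amdgpu_umc_update_ecc_status() and the update_ecc_status callback in
struct amdgpu_umc_ras. The umc v12_0 implementation sets de_updated when an
MCA entry reports a UMC deferred error. Also make
amdgpu_ras_add_mca_err_addr() return early, since it will be retired.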

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       |  9 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h       |  6 +++++
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c        | 24 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h        |  3 +++
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  5 ++++
 6 files changed, 49 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 126616eaeec1..702229abe7ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4213,6 +4213,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_a
 {
 	struct ras_err_addr *mca_err_addr;
 
+	/* This function will be retired. */
+	return;
 	mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
 	if (!mca_err_addr)
 		return;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index f486510fc94c..7006a57277ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -437,3 +437,12 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
 
 	return 0;
 }
+
+int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
+				uint64_t status, uint64_t ipid, uint64_t addr)
+{
+	if (adev->umc.ras->update_ecc_status)
+		return adev->umc.ras->update_ecc_status(adev,
+					status, ipid, addr);
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 563b0249247e..4f3834fa10a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -66,6 +66,8 @@ struct amdgpu_umc_ras {
 					void *ras_error_status);
 	bool (*check_ecc_err_status)(struct amdgpu_device *adev,
 			enum amdgpu_mca_error_type type, void *ras_error_status);
+	int (*update_ecc_status)(struct amdgpu_device *adev,
+			uint64_t status, uint64_t ipid, uint64_t addr);
 };
 
 struct amdgpu_umc_funcs {
@@ -122,4 +124,8 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
 
 int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 			uint32_t reset, uint32_t timeout_ms);
+
+int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
+				uint64_t status, uint64_t ipid, uint64_t addr);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index a0122b22eda4..81435533c4a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -479,6 +479,29 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common
 	return 0;
 }
 
+static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
+			uint64_t status, uint64_t ipid, uint64_t addr)
+{
+	uint16_t hwid, mcatype;
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+	hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
+	mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
+
+	if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0))
+		return 0;
+
+	if (!status)
+		return 0;
+
+	if (!umc_v12_0_is_deferred_error(adev, status))
+		return 0;
+
+	con->umc_ecc_log.de_updated = true;
+
+	return 0;
+}
+
 struct amdgpu_umc_ras umc_v12_0_ras = {
 	.ras_block = {
 		.hw_ops = &umc_v12_0_ras_hw_ops,
@@ -489,5 +512,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
 	.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
 	.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
 	.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
+	.update_ecc_status = umc_v12_0_update_ecc_status,
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 1d5f44dcffdd..5c2d7e127608 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -62,6 +62,9 @@
 /* row bits in SOC physical address */
 #define UMC_V12_0_PA_R13_BIT 35
 
+#define MCA_UMC_HWID_V12_0     0x96
+#define MCA_UMC_MCATYPE_V12_0  0x0
+
 #define MCA_IPID_LO_2_UMC_CH(_ipid_lo) (((((_ipid_lo) >> 20) & 0x1) * 4) + \
 			(((_ipid_lo) >> 12) & 0xF))
 #define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 55d11ea8c717..8370c2130476 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2681,6 +2681,11 @@ static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct
 	    umc_v12_0_is_correctable_error(adev, status0))
 		*count = (ext_error_code == 0) ? odecc_err_cnt : 1;
 
+	amdgpu_umc_update_ecc_status(adev,
+			entry->regs[MCA_REG_IDX_STATUS],
+			entry->regs[MCA_REG_IDX_IPID],
+			entry->regs[MCA_REG_IDX_ADDR]);
+
 	return 0;
 }
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (3 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 05/15] drm/amdgpu: add interface to update umc v12_0 ecc status YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-25  3:02   ` Zhang, Hawking
  2024-04-18  2:58 ` [PATCH 07/15] drm/amdgpu: umc v12_0 logs ecc errors YiPeng Chai
                   ` (9 subsequent siblings)
  14 siblings, 1 reply; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

For umc v12_0, convert the MCA error address of a deferred error into the
page frame numbers of the affected pages.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 94 +++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 12 ++++
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 81435533c4a7..085dcfe16b5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
 	}
 }
 
+static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
+				struct ta_ras_query_address_input *addr_in,
+				uint64_t *pfns, int len)
+{
+	uint32_t col, row, row_xor, bank, channel_index;
+	uint64_t soc_pa, retired_page, column, err_addr;
+	struct ta_ras_query_address_output addr_out;
+	uint32_t pos = 0;
+
+	err_addr = addr_in->ma.err_addr;
+	addr_in->addr_type = TA_RAS_MCA_TO_PA;
+	if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
+		dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
+			err_addr);
+		return 0;
+	}
+
+	soc_pa = addr_out.pa.pa;
+	bank = addr_out.pa.bank;
+	channel_index = addr_out.pa.channel_idx;
+
+	col = (err_addr >> 1) & 0x1fULL;
+	row = (err_addr >> 10) & 0x3fffULL;
+	row_xor = row ^ (0x1ULL << 13);
+	/* clear [C3 C2] in soc physical address */
+	soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+	/* clear [C4] in soc physical address */
+	soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+	/* loop for all possibilities of [C4 C3 C2] */
+	for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+		retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+		retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+
+		if (pos >= len)
+			return 0;
+		pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+		/* include column bit 0 and 1 */
+		col &= 0x3;
+		col |= (column << 2);
+		dev_info(adev->dev,
+			"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
+			retired_page, row, col, bank, channel_index);
+
+		/* shift R13 bit */
+		retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+
+		if (pos >= len)
+			return 0;
+		pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+		dev_info(adev->dev,
+			"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
+			retired_page, row_xor, col, bank, channel_index);
+	}
+
+	return pos;
+}
+
 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
 					uint32_t node_inst, uint32_t umc_inst,
 					uint32_t ch_inst, void *data)
@@ -482,8 +542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common
 static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
 			uint64_t status, uint64_t ipid, uint64_t addr)
 {
-	uint16_t hwid, mcatype;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	uint16_t hwid, mcatype;
+	struct ta_ras_query_address_input addr_in;
+	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
+	uint64_t err_addr;
+	int count;
 
 	hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
 	mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
@@ -497,6 +561,34 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
 	if (!umc_v12_0_is_deferred_error(adev, status))
 		return 0;
 
+	err_addr = REG_GET_FIELD(addr,
+				MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+
+	dev_info(adev->dev,
+		"UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n",
+		ipid,
+		MCA_IPID_2_SOCKET_ID(ipid),
+		MCA_IPID_2_DIE_ID(ipid),
+		MCA_IPID_2_UMC_INST(ipid),
+		MCA_IPID_2_UMC_CH(ipid),
+		err_addr);
+
+	memset(page_pfn, 0, sizeof(page_pfn));
+
+	memset(&addr_in, 0, sizeof(addr_in));
+	addr_in.ma.err_addr = err_addr;
+	addr_in.ma.ch_inst = MCA_IPID_2_UMC_CH(ipid);
+	addr_in.ma.umc_inst = MCA_IPID_2_UMC_INST(ipid);
+	addr_in.ma.node_inst = MCA_IPID_2_DIE_ID(ipid);
+	addr_in.ma.socket_id = MCA_IPID_2_SOCKET_ID(ipid);
+
+	count = umc_v12_0_convert_err_addr(adev,
+				&addr_in, page_pfn, ARRAY_SIZE(page_pfn));
+	if (count <= 0) {
+		dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
+		return 0;
+	}
+
 	con->umc_ecc_log.de_updated = true;
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 5c2d7e127608..b4974793850b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -69,6 +69,18 @@
 			(((_ipid_lo) >> 12) & 0xF))
 #define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
 
+#define MCA_IPID_2_DIE_ID(ipid)  ((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) >> 2) & 0x03)
+
+#define MCA_IPID_2_UMC_CH(ipid) \
+	(MCA_IPID_LO_2_UMC_CH(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
+
+#define MCA_IPID_2_UMC_INST(ipid) \
+	(MCA_IPID_LO_2_UMC_INST(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
+
+#define MCA_IPID_2_SOCKET_ID(ipid) \
+	(((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
+	 (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))
+
 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 07/15] drm/amdgpu: umc v12_0 logs ecc errors
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (4 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 08/15] drm/amdgpu: Add delay work to retire bad pages YiPeng Chai
                   ` (8 subsequent siblings)
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

1. umc v12_0 logs ecc errors.
2. Reserve newly detected ecc error pages.
3. Add tag for bad pages, so that they can
   be retired later.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 67 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  7 ++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 41 ++++++++++++++-
 3 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 7006a57277ef..0f2d765c4e2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -21,10 +21,13 @@
  *
  */
 
+#include <linux/sort.h>
 #include "amdgpu.h"
 #include "umc_v6_7.h"
 #define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms
 
+#define MAX_UMC_HASH_STRING_SIZE  256
+
 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
 				    struct ras_err_data *err_data, uint64_t err_addr,
 				    uint32_t ch_inst, uint32_t umc_inst)
@@ -446,3 +449,67 @@ int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
 					status, ipid, addr);
 	return 0;
 }
+
+static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
+{
+	uint64_t *addr_a = (uint64_t *)a;
+	uint64_t *addr_b = (uint64_t *)b;
+
+	if (*addr_a > *addr_b)
+		return 1;
+	else if (*addr_a < *addr_b)
+		return -1;
+	else
+		return 0;
+}
+
+/*
+ * Use a string hash of the sorted pfns to avoid logging the same bad pages
+ * repeatedly. Sorts pfns in place. Returns 0, or -EINVAL on NULL pfns/len <= 0.
+ */
+int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
+		uint64_t *pfns, int len, uint64_t *val)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
+	int offset = 0, i = 0;
+
+	if (!pfns || len <= 0)
+		return -EINVAL;
+
+	sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);
+
+	/* scnprintf() returns what was written, so offset stays inside buf */
+	for (i = 0; i < len; i++)
+		offset += scnprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]);
+
+	*val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);
+	return 0;
+}
+
+int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
+		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_ecc_log_info *ecc_log;
+	int ret;
+
+	ecc_log = &con->umc_ecc_log;
+
+	mutex_lock(&ecc_log->lock);
+	ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
+	if (!ret) {
+		struct ras_err_pages *err_pages = &ecc_err->err_pages;
+		int i;
+
+		/* Reserve memory */
+		for (i = 0; i < err_pages->count; i++)
+			amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);
+
+		radix_tree_tag_set(ecc_tree,
+			ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+	}
+	mutex_unlock(&ecc_log->lock);
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 4f3834fa10a8..c83d24097c5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -52,6 +52,8 @@
 #define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
 		LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
 
+/* Page retirement tag */
+#define UMC_ECC_NEW_DETECTED_TAG       0x1
 
 typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
 			uint32_t umc_inst, uint32_t ch_inst, void *data);
@@ -127,5 +129,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 
 int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
 				uint64_t status, uint64_t ipid, uint64_t addr);
-
+int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
+		uint64_t *pfns, int len, uint64_t *val);
+int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
+		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 085dcfe16b5e..6c2b61ef5b57 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -546,8 +546,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
 	uint16_t hwid, mcatype;
 	struct ta_ras_query_address_input addr_in;
 	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
-	uint64_t err_addr;
+	uint64_t err_addr, hash_val = 0;
+	struct ras_ecc_err *ecc_err;
 	int count;
+	int ret;
 
 	hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
 	mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
@@ -589,6 +591,43 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
 		return 0;
 	}
 
+	ret = amdgpu_umc_build_pages_hash(adev,
+			page_pfn, count, &hash_val);
+	if (ret) {
+		dev_err(adev->dev, "Fail to build error pages hash\n");
+		return ret;
+	}
+
+	ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
+	if (!ecc_err)
+		return -ENOMEM;
+
+	ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL);
+	if (!ecc_err->err_pages.pfn) {
+		kfree(ecc_err);
+		return -ENOMEM;
+	}
+
+	memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn));
+	ecc_err->err_pages.count = count;
+
+	ecc_err->hash_index = hash_val;
+	ecc_err->status = status;
+	ecc_err->ipid = ipid;
+	ecc_err->addr = addr;
+
+	ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
+	if (ret) {
+		if (ret == -EEXIST)
+			con->umc_ecc_log.de_updated = true;
+		else
+			dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
+
+		kfree(ecc_err->err_pages.pfn);
+		kfree(ecc_err);
+		return ret;
+	}
+
 	con->umc_ecc_log.de_updated = true;
 
 	return 0;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 08/15] drm/amdgpu: Add delay work to retire bad pages
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (5 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 07/15] drm/amdgpu: umc v12_0 logs ecc errors YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 09/15] drm/amdgpu: add condition check for amdgpu_umc_fill_error_record YiPeng Chai
                   ` (7 subsequent siblings)
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

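Add a delayed work item to retire bad pages; it reschedules itself every
100 ms while newly detected ECC errors are still tagged in the ECC log.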

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 36 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  3 +++
 4 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 702229abe7ee..c1f146d3e28d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 
 #define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
 
+#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2766,6 +2768,30 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
 	ecc_log->de_updated = false;
 }
 
+static void amdgpu_ras_do_page_retirement(struct work_struct *work)
+{
+	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+					      page_retirement_dwork.work);
+	struct amdgpu_device *adev = con->adev;
+	struct ras_err_data err_data;
+
+	if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
+		return;
+
+	amdgpu_ras_error_data_init(&err_data);
+
+	amdgpu_umc_handle_bad_pages(adev, &err_data);
+
+	amdgpu_ras_error_data_fini(&err_data);
+
+	mutex_lock(&con->umc_ecc_log.lock);
+	if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+				UMC_ECC_NEW_DETECTED_TAG))
+		schedule_delayed_work(&con->page_retirement_dwork,
+			msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
+	mutex_unlock(&con->umc_ecc_log.lock);
+}
+
 static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
 			enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
 {
@@ -2804,7 +2830,12 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
 static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
 					uint32_t timeout)
 {
-	amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	int ret;
+
+	ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+	if (!ret)
+		schedule_delayed_work(&con->page_retirement_dwork, 0);
 }
 
 static int amdgpu_ras_page_retirement_thread(void *param)
@@ -2919,6 +2950,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
 	}
 
+	INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
 	amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
 #ifdef CONFIG_X86_MCE_AMD
 #ifdef HAVE_SMCA_UMC_V2
@@ -2967,6 +2999,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 
 	cancel_work_sync(&con->recovery_work);
 
+	cancel_delayed_work_sync(&con->page_retirement_dwork);
+
 	amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
 
 	mutex_lock(&con->recovery_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 634654cf2634..cb5a0f31d201 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -537,6 +537,7 @@ struct amdgpu_ras {
 	struct mutex page_rsv_lock;
 	DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
 	struct ras_ecc_log_info  umc_ecc_log;
+	struct delayed_work page_retirement_dwork;
 
 	/* Fatal error detected flag */
 	atomic_t fed;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0f2d765c4e2d..2bd88218c20e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -89,7 +89,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
 	return ret;
 }
 
-static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
 			void *ras_error_status)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index c83d24097c5c..2d08d076f7c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -133,4 +133,7 @@ int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
 		uint64_t *pfns, int len, uint64_t *val);
 int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
 		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
+
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+			void *ras_error_status);
 #endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 09/15] drm/amdgpu: add condition check for amdgpu_umc_fill_error_record
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (6 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 08/15] drm/amdgpu: Add delay work to retire bad pages YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0 YiPeng Chai
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

Add condition check for amdgpu_umc_fill_error_record.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 20 +++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  2 +-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cb5a0f31d201..c8980d5f6540 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -579,6 +579,7 @@ struct ras_err_data {
 	unsigned long de_count;
 	unsigned long err_addr_cnt;
 	struct eeprom_table_record *err_addr;
+	unsigned long err_addr_len;
 	u32 err_list_count;
 	struct list_head err_node_list;
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 2bd88218c20e..dcda3d24bee3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -66,6 +66,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
 		goto out_fini_err_data;
 	}
 
+	err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query;
+
 	/*
 	 * Translate UMC channel address to Physical address
 	 */
@@ -121,6 +123,8 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
 			if(!err_data->err_addr)
 				dev_warn(adev->dev, "Failed to alloc memory for "
 						"umc error address record!\n");
+			else
+				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
 
 			/* umc query_ras_error_address is also responsible for clearing
 			 * error status
@@ -146,6 +150,8 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
 			if(!err_data->err_addr)
 				dev_warn(adev->dev, "Failed to alloc memory for "
 						"umc error address record!\n");
+			else
+				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
 
 			/* umc query_ras_error_address is also responsible for clearing
 			 * error status
@@ -389,14 +395,20 @@ int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 	return 0;
 }
 
-void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
+int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 		uint64_t err_addr,
 		uint64_t retired_page,
 		uint32_t channel_index,
 		uint32_t umc_inst)
 {
-	struct eeprom_table_record *err_rec =
-		&err_data->err_addr[err_data->err_addr_cnt];
+	struct eeprom_table_record *err_rec;
+
+	if (!err_data ||
+	    !err_data->err_addr ||
+	    (err_data->err_addr_cnt >= err_data->err_addr_len))
+		return -EINVAL;
+
+	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
 
 	err_rec->address = err_addr;
 	/* page frame address is saved */
@@ -408,6 +420,8 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 	err_rec->mcumc_id = umc_inst;
 
 	err_data->err_addr_cnt++;
+
+	return 0;
 }
 
 int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 2d08d076f7c9..9e77e6d48e3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -109,7 +109,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
-void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
+int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
 		uint64_t err_addr,
 		uint64_t retired_page,
 		uint32_t channel_index,
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (7 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 09/15] drm/amdgpu: add condition check for amdgpu_umc_fill_error_record YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-22  8:14   ` Zhou1, Tao
  2024-04-18  2:58 ` [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption YiPeng Chai
                   ` (5 subsequent siblings)
  14 siblings, 1 reply; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

Retire bad pages for umc v12_0.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 +++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6c2b61ef5b57..bd917eb6ea24 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -28,6 +28,8 @@
 #include "umc/umc_12_0_0_sh_mask.h"
 #include "mp/mp_13_0_6_sh_mask.h"
 
+#define MAX_ECC_NUM_PER_RETIREMENT  16
+
 static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
 					    uint32_t node_inst,
 					    uint32_t umc_inst,
@@ -633,6 +635,58 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
+				struct ras_ecc_err *ecc_err, void *ras_error_status)
+{
+	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+	uint32_t i = 0;
+	int ret = 0;
+
+	if (!err_data || !ecc_err)
+		return -EINVAL;
+
+	for (i = 0; i < ecc_err->err_pages.count; i++) {
+		ret = amdgpu_umc_fill_error_record(err_data,
+				ecc_err->addr,
+				ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
+				MCA_IPID_2_UMC_CH(ecc_err->ipid),
+				MCA_IPID_2_UMC_INST(ecc_err->ipid));
+		if (ret)
+			break;
+	}
+
+	err_data->de_count++;
+
+	return ret;
+}
+
+static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
+					void *ras_error_status)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
+	struct radix_tree_root *ecc_tree;
+	int new_detected, ret, i;
+
+	ecc_tree = &con->umc_ecc_log.de_page_tree;
+
+	mutex_lock(&con->umc_ecc_log.lock);
+	new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
+			0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
+	for (i = 0; i < new_detected; i++) {
+		if (!entries[i])
+			continue;
+
+		ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
+		if (ret) {
+			dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
+			break;
+		}
+		radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+	}
+	mutex_unlock(&con->umc_ecc_log.lock);
+}
+
 struct amdgpu_umc_ras umc_v12_0_ras = {
 	.ras_block = {
 		.hw_ops = &umc_v12_0_ras_hw_ops,
@@ -640,8 +694,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
 	},
 	.err_cnt_init = umc_v12_0_err_cnt_init,
 	.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
-	.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
-	.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
+	.ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
 	.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
 	.update_ecc_status = umc_v12_0_update_ecc_status,
 };
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (8 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0 YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-25  3:00   ` Zhang, Hawking
  2024-04-18  2:58 ` [PATCH 12/15] drm/amdgpu: add poison consumption handler YiPeng Chai
                   ` (4 subsequent siblings)
  14 siblings, 1 reply; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

Prepare to handle pasid poison consumption.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  9 ++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       | 20 ++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h       |  3 +++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 66753940bb4d..287ce431901c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -759,10 +759,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
 	return amdgpu_ras_get_fed_status(adev);
 }
 
+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+				enum amdgpu_ras_block block, uint16_t pasid,
+				pasid_notify pasid_fn, void *data, uint32_t reset)
+{
+	amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, data, reset);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 	enum amdgpu_ras_block block, uint32_t reset)
 {
-	amdgpu_umc_poison_handler(adev, block, reset);
+	amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
 }
 
 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ad50c7bbc326..54e15994d02b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -401,6 +401,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
 				struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 			enum amdgpu_ras_block block, uint32_t reset);
+
+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+			enum amdgpu_ras_block block, uint16_t pasid,
+			pasid_notify pasid_fn, void *data, uint32_t reset);
+
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dcda3d24bee3..8ebbca9e2e22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -252,8 +252,9 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 	return 0;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-			enum amdgpu_ras_block block, uint32_t reset)
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+			enum amdgpu_ras_block block, uint16_t pasid,
+			pasid_notify pasid_fn, void *data, uint32_t reset)
 {
 	int ret = AMDGPU_RAS_SUCCESS;
 
@@ -291,16 +292,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
 
 			amdgpu_ras_error_data_fini(&err_data);
 		} else {
-			if (reset) {
-				amdgpu_umc_bad_page_polling_timeout(adev,
-							reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
-			} else {
 				struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
+				amdgpu_ras_put_poison_req(adev,
+					block, pasid, pasid_fn, data, reset);
+
 				atomic_inc(&con->page_retirement_req_cnt);
 
 				wake_up(&con->page_retirement_wq);
-			}
 		}
 	} else {
 		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
@@ -313,6 +312,13 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
 	return ret;
 }
 
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+			enum amdgpu_ras_block block, uint32_t reset)
+{
+	return amdgpu_umc_pasid_poison_handler(adev,
+				block, 0, NULL, NULL, reset);
+}
+
 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 		void *ras_error_status,
 		struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 9e77e6d48e3b..5f50c69c3cec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -106,6 +106,9 @@ int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
 			enum amdgpu_ras_block block, uint32_t reset);
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+			enum amdgpu_ras_block block, uint16_t pasid,
+			pasid_notify pasid_fn, void *data, uint32_t reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4..6bf4bbc3cffa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -201,7 +201,8 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 			"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
 			client_id);
 
-	amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
+	amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
+		block, pasid, NULL, NULL, reset);
 }
 
 static bool context_id_expected(struct kfd_dev *dev)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 12/15] drm/amdgpu: add poison consumption handler
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (9 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 13/15] drm/amdgpu: support ACA logging ecc errors YiPeng Chai
                   ` (3 subsequent siblings)
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

Add poison consumption handler.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 ++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c1f146d3e28d..3f34b3abbd79 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2838,12 +2838,35 @@ static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
 		schedule_delayed_work(&con->page_retirement_dwork, 0);
 }
 
+static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
+			struct ras_poison_msg *poison_msg)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	uint32_t reset = poison_msg->reset;
+	uint16_t pasid = poison_msg->pasid;
+
+	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
+	if (poison_msg->pasid_fn)
+		poison_msg->pasid_fn(adev, pasid, poison_msg->data);
+
+	if (reset) {
+		cancel_delayed_work_sync(&con->page_retirement_dwork);
+
+		con->gpu_reset_flags |= reset;
+		amdgpu_ras_reset_gpu(adev);
+	}
+
+	return 0;
+}
+
 static int amdgpu_ras_page_retirement_thread(void *param)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)param;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_poison_msg poison_msg;
 	enum amdgpu_ras_block ras_block;
+	bool poison_creation_is_handled = false;
 
 	while (!kthread_should_stop()) {
 
@@ -2864,12 +2887,24 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 		dev_info(adev->dev, "Start processing ras block %s(%d)\n",
 				ras_block_str(ras_block), ras_block);
 
-		if (ras_block == AMDGPU_RAS_BLOCK__UMC)
+		if (ras_block == AMDGPU_RAS_BLOCK__UMC) {
 			amdgpu_ras_poison_creation_handler(adev,
 				MAX_UMC_POISON_POLLING_TIME_ASYNC);
-		else
-			amdgpu_umc_bad_page_polling_timeout(adev,
-				false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+			poison_creation_is_handled = true;
+		} else {
+			/* poison_creation_is_handled:
+			 *   false: no poison creation interrupt, but it has poison
+			 *          consumption interrupt.
+			 *   true: It has poison creation interrupt at the beginning,
+			 *         but it has no poison creation interrupt later.
+			 */
+			amdgpu_ras_poison_creation_handler(adev,
+					poison_creation_is_handled ?
+					0 : MAX_UMC_POISON_POLLING_TIME_ASYNC);
+
+			amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
+			poison_creation_is_handled = false;
+		}
 	}
 
 	return 0;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 13/15] drm/amdgpu: support ACA logging ecc errors
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (10 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 12/15] drm/amdgpu: add poison consumption handler YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 14/15] drm/amdgpu: Fix address translation defect YiPeng Chai
                   ` (2 subsequent siblings)
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

Support logging ECC errors from the ACA bank parser.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index bd917eb6ea24..8df84feaf046 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -508,6 +508,11 @@ static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank
 	if (ret)
 		return ret;
 
+	amdgpu_umc_update_ecc_status(adev,
+		bank->regs[ACA_REG_IDX_STATUS],
+		bank->regs[ACA_REG_IDX_IPID],
+		bank->regs[ACA_REG_IDX_ADDR]);
+
 	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
 	count = ext_error_code == 0 ?
 		ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]) : 1ULL;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 14/15] drm/amdgpu: Fix address translation defect
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (11 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 13/15] drm/amdgpu: support ACA logging ecc errors YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  2:58 ` [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page YiPeng Chai
  2024-04-22  7:06 ` [PATCH 01/15] drm/amdgpu: Add " Christian König
  14 siblings, 0 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

retired_page is a page frame number and must be shifted by
AMDGPU_GPU_PAGE_SHIFT to a full address before querying its status.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3f34b3abbd79..d1a2ab944b7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2403,7 +2403,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 		};
 		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
-				data->bps[i].retired_page);
+				data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
 		if (status == -EBUSY)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
 		else if (status == -ENOENT)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (12 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 14/15] drm/amdgpu: Fix address translation defect YiPeng Chai
@ 2024-04-18  2:58 ` YiPeng Chai
  2024-04-18  9:00   ` Christian König
  2024-04-22  2:25   ` Chai, Thomas
  2024-04-22  7:06 ` [PATCH 01/15] drm/amdgpu: Add " Christian König
  14 siblings, 2 replies; 29+ messages in thread
From: YiPeng Chai @ 2024-04-18  2:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

Use new interface to reserve bad page.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d1a2ab944b7d..dee66db10fa2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2548,9 +2548,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			goto out;
 		}
 
-		amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
-			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
-			AMDGPU_GPU_PAGE_SIZE);
+		amdgpu_ras_reserve_page(adev, bps[i].retired_page);
 
 		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
 		data->count++;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page
  2024-04-18  2:58 ` [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page YiPeng Chai
@ 2024-04-18  9:00   ` Christian König
  2024-04-18  9:34     ` Chai, Thomas
  2024-04-22  2:25   ` Chai, Thomas
  1 sibling, 1 reply; 29+ messages in thread
From: Christian König @ 2024-04-18  9:00 UTC (permalink / raw)
  To: YiPeng Chai, amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang

Am 18.04.24 um 04:58 schrieb YiPeng Chai:
> Use new interface to reserve bad page.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +---
>   1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d1a2ab944b7d..dee66db10fa2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2548,9 +2548,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>   			goto out;
>   		}
>   
> -		amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
> -			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
> -			AMDGPU_GPU_PAGE_SIZE);

Where is the call to reserve the VRAM range now moved?

Regards,
Christian.

> +		amdgpu_ras_reserve_page(adev, bps[i].retired_page);
>   
>   		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
>   		data->count++;


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page
  2024-04-18  9:00   ` Christian König
@ 2024-04-18  9:34     ` Chai, Thomas
  2024-04-22  8:27       ` Zhou1, Tao
  0 siblings, 1 reply; 29+ messages in thread
From: Chai, Thomas @ 2024-04-18  9:34 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: Zhang, Hawking, Zhou1, Tao, Li, Candice, Wang, Yang(Kevin),
	Yang, Stanley

[AMD Official Use Only - General]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Thursday, April 18, 2024 5:01 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: Re: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page

Am 18.04.24 um 04:58 schrieb YiPeng Chai:
> Use new interface to reserve bad page.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +---
>   1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d1a2ab944b7d..dee66db10fa2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2548,9 +2548,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>                       goto out;
>               }
>
> -             amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
> -                     bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
> -                     AMDGPU_GPU_PAGE_SIZE);

> Where is the call to reserve the VRAM range now moved?

[Thomas] It is called in amdgpu_ras_reserve_page; for amdgpu_ras_reserve_page, please refer to "[PATCH 01/15] drm/amdgpu: Add interface to reserve bad page".

Regards,
Christian.

> +             amdgpu_ras_reserve_page(adev, bps[i].retired_page);
>
>               memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
>               data->count++;


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page
  2024-04-18  2:58 ` [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page YiPeng Chai
  2024-04-18  9:00   ` Christian König
@ 2024-04-22  2:25   ` Chai, Thomas
  1 sibling, 0 replies; 29+ messages in thread
From: Chai, Thomas @ 2024-04-22  2:25 UTC (permalink / raw)
  To: amd-gfx
  Cc: Zhang, Hawking, Zhou1, Tao, Li, Candice, Wang, Yang(Kevin),
	Yang, Stanley

[AMD Official Use Only - General]

Ping ....


-----------------
Best Regards,
Thomas

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Thursday, April 18, 2024 10:59 AM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page

Use new interface to reserve bad page.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d1a2ab944b7d..dee66db10fa2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2548,9 +2548,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                        goto out;
                }

-               amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
-                       bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
-                       AMDGPU_GPU_PAGE_SIZE);
+               amdgpu_ras_reserve_page(adev, bps[i].retired_page);

                memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
                data->count++;
--
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page
  2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
                   ` (13 preceding siblings ...)
  2024-04-18  2:58 ` [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page YiPeng Chai
@ 2024-04-22  7:06 ` Christian König
  14 siblings, 0 replies; 29+ messages in thread
From: Christian König @ 2024-04-22  7:06 UTC (permalink / raw)
  To: YiPeng Chai, amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang

Am 18.04.24 um 04:58 schrieb YiPeng Chai:
> Add interface to reserve bad page.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>

Yeah, that approach looks valid to me. Just keep in mind that 
amdgpu_vram_mgr_query_page_status() is not the fastest function because it 
does a linear search.

Apart from that Reviewed-by: Christian König <christian.koenig@amd.com> 
for this patch, but can't really judge the rest of the patch set.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 ++++
>   2 files changed, 23 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2c97cb80d79a..05782d68f073 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2782,6 +2782,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   		}
>   	}
>   
> +	mutex_init(&con->page_rsv_lock);
>   	mutex_init(&con->page_retirement_lock);
>   	init_waitqueue_head(&con->page_retirement_wq);
>   	atomic_set(&con->page_retirement_req_cnt, 0);
> @@ -2835,6 +2836,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
>   
>   	atomic_set(&con->page_retirement_req_cnt, 0);
>   
> +	mutex_destroy(&con->page_rsv_lock);
> +
>   	cancel_work_sync(&con->recovery_work);
>   
>   	mutex_lock(&con->recovery_lock);
> @@ -4278,3 +4281,19 @@ void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
>   			amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);
>   	}
>   }
> +
> +int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
> +{
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
> +	uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
> +	int ret = 0;
> +
> +	mutex_lock(&con->page_rsv_lock);
> +	ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> +	if (ret == -ENOENT)
> +		ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE);
> +	mutex_unlock(&con->page_rsv_lock);
> +
> +	return ret;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 8d26989c75c8..ab5bf573378e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -500,6 +500,7 @@ struct amdgpu_ras {
>   	wait_queue_head_t page_retirement_wq;
>   	struct mutex page_retirement_lock;
>   	atomic_t page_retirement_req_cnt;
> +	struct mutex page_rsv_lock;
>   	/* Fatal error detected flag */
>   	atomic_t fed;
>   
> @@ -909,4 +910,7 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
>   
>   bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
>   u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
> +
> +int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
> +
>   #endif


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
  2024-04-18  2:58 ` [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0 YiPeng Chai
@ 2024-04-22  8:14   ` Zhou1, Tao
  2024-04-22  9:21     ` Chai, Thomas
  0 siblings, 1 reply; 29+ messages in thread
From: Zhou1, Tao @ 2024-04-22  8:14 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang,
> Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57
> +++++++++++++++++++++++++-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
>  #include "umc/umc_12_0_0_sh_mask.h"
>  #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT  16

[Tao] We already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for this purpose.

> +
>  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
>                                           uint32_t node_inst,
>                                           uint32_t umc_inst,
> @@ -633,6 +635,58 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
>       return 0;
>  }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> +                             struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> +     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> +     uint32_t i = 0;
> +     int ret = 0;
> +
> +     if (!err_data || !ecc_err)
> +             return -EINVAL;
> +
> +     for (i = 0; i < ecc_err->err_pages.count; i++) {
> +             ret = amdgpu_umc_fill_error_record(err_data,
> +                             ecc_err->addr,
> +                             ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> +                             MCA_IPID_2_UMC_CH(ecc_err->ipid),
> +                             MCA_IPID_2_UMC_INST(ecc_err->ipid));
> +             if (ret)
> +                     break;
> +     }
> +
> +     err_data->de_count++;
> +
> +     return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> +                                     void *ras_error_status)
> +{
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> +     struct radix_tree_root *ecc_tree;
> +     int new_detected, ret, i;
> +
> +     ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> +     mutex_lock(&con->umc_ecc_log.lock);
> +     new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> +                     0, ARRAY_SIZE(entries),
> UMC_ECC_NEW_DETECTED_TAG);
> +     for (i = 0; i < new_detected; i++) {
> +             if (!entries[i])
> +                     continue;
> +
> +             ret = umc_v12_0_fill_error_record(adev, entries[i],
> ras_error_status);
> +             if (ret) {
> +                     dev_err(adev->dev, "Fail to fill umc error record,
> ret:%d\n", ret);
> +                     break;
> +             }
> +             radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> +     }
> +     mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
>  struct amdgpu_umc_ras umc_v12_0_ras = {
>       .ras_block = {
>               .hw_ops = &umc_v12_0_ras_hw_ops,
> @@ -640,8 +694,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
>       },
>       .err_cnt_init = umc_v12_0_err_cnt_init,
>       .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> -     .ecc_info_query_ras_error_count =
> umc_v12_0_ecc_info_query_ras_error_count,
> -     .ecc_info_query_ras_error_address =
> umc_v12_0_ecc_info_query_ras_error_address,
> +     .ecc_info_query_ras_error_address =
> umc_v12_0_query_ras_ecc_err_addr,
>       .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
>       .update_ecc_status = umc_v12_0_update_ecc_status,  };
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page
  2024-04-18  9:34     ` Chai, Thomas
@ 2024-04-22  8:27       ` Zhou1, Tao
  0 siblings, 0 replies; 29+ messages in thread
From: Zhou1, Tao @ 2024-04-22  8:27 UTC (permalink / raw)
  To: Chai, Thomas, Christian König, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

With my concern fixed, the series is:

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Thursday, April 18, 2024 5:35 PM
> To: Christian König <ckoenig.leichtzumerken@gmail.com>; amd-
> gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao
> <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin)
> <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
> Subject: RE: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page
>
> [AMD Official Use Only - General]
>
> -----------------
> Best Regards,
> Thomas
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Thursday, April 18, 2024 5:01 PM
> To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang,
> Stanley <Stanley.Yang@amd.com>
> Subject: Re: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page
>
> Am 18.04.24 um 04:58 schrieb YiPeng Chai:
> > Use new interface to reserve bad page.
> >
> > Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +---
> >   1 file changed, 1 insertion(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index d1a2ab944b7d..dee66db10fa2 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -2548,9 +2548,7 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
> >                       goto out;
> >               }
> >
> > -             amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
> > -                     bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
> > -                     AMDGPU_GPU_PAGE_SIZE);
>
> > Where is the call to reserve the VRAM range now moved?
>
> [Thomas] It is called in amdgpu_ras_reserve_page; for amdgpu_ras_reserve_page, please
> refer to "[PATCH 01/15] drm/amdgpu: Add interface to reserve bad page".
>
> Regards,
> Christian.
>
> > +             amdgpu_ras_reserve_page(adev, bps[i].retired_page);
> >
> >               memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
> >               data->count++;
>


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
  2024-04-22  8:14   ` Zhou1, Tao
@ 2024-04-22  9:21     ` Chai, Thomas
  2024-04-22  9:33       ` Chai, Thomas
  0 siblings, 1 reply; 29+ messages in thread
From: Chai, Thomas @ 2024-04-22  9:21 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Monday, April 22, 2024 4:14 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>;
> Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas
> <YiPeng.Chai@amd.com>
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57
> +++++++++++++++++++++++++-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
>  #include "umc/umc_12_0_0_sh_mask.h"
>  #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT  16

> [Tao] We already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for this purpose.

[Thomas]  This is defined from the EEPROM's point of view: it is the maximum number of EEPROM table entries written at a time.
The actual number of data items that need to be written to the EEPROM may be less than this value, or may be more than this value (e.g. later, when the data items are written before address conversion).
If the number is less than this value, the actual number of items is written; if it is more than this value, the items are written in multiple batches.


> +
>  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
>                                           uint32_t node_inst,
>                                           uint32_t umc_inst, @@ -633,6
> +635,58 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
>       return 0;
>  }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> +                             struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> +     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> +     uint32_t i = 0;
> +     int ret = 0;
> +
> +     if (!err_data || !ecc_err)
> +             return -EINVAL;
> +
> +     for (i = 0; i < ecc_err->err_pages.count; i++) {
> +             ret = amdgpu_umc_fill_error_record(err_data,
> +                             ecc_err->addr,
> +                             ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> +                             MCA_IPID_2_UMC_CH(ecc_err->ipid),
> +                             MCA_IPID_2_UMC_INST(ecc_err->ipid));
> +             if (ret)
> +                     break;
> +     }
> +
> +     err_data->de_count++;
> +
> +     return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> +                                     void *ras_error_status) {
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> +     struct radix_tree_root *ecc_tree;
> +     int new_detected, ret, i;
> +
> +     ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> +     mutex_lock(&con->umc_ecc_log.lock);
> +     new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> +                     0, ARRAY_SIZE(entries),
> UMC_ECC_NEW_DETECTED_TAG);
> +     for (i = 0; i < new_detected; i++) {
> +             if (!entries[i])
> +                     continue;
> +
> +             ret = umc_v12_0_fill_error_record(adev, entries[i],
> ras_error_status);
> +             if (ret) {
> +                     dev_err(adev->dev, "Fail to fill umc error
> + record,
> ret:%d\n", ret);
> +                     break;
> +             }
> +             radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> +     }
> +     mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
>  struct amdgpu_umc_ras umc_v12_0_ras = {
>       .ras_block = {
>               .hw_ops = &umc_v12_0_ras_hw_ops, @@ -640,8 +694,7 @@
> struct amdgpu_umc_ras umc_v12_0_ras = {
>       },
>       .err_cnt_init = umc_v12_0_err_cnt_init,
>       .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> -     .ecc_info_query_ras_error_count =
> umc_v12_0_ecc_info_query_ras_error_count,
> -     .ecc_info_query_ras_error_address =
> umc_v12_0_ecc_info_query_ras_error_address,
> +     .ecc_info_query_ras_error_address =
> umc_v12_0_query_ras_ecc_err_addr,
>       .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
>       .update_ecc_status = umc_v12_0_update_ecc_status,  };
> --
> 2.34.1



^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
  2024-04-22  9:21     ` Chai, Thomas
@ 2024-04-22  9:33       ` Chai, Thomas
  0 siblings, 0 replies; 29+ messages in thread
From: Chai, Thomas @ 2024-04-22  9:33 UTC (permalink / raw)
  To: Chai, Thomas, Zhou1, Tao, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

update

-----------------
Best Regards,
Thomas

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Chai, Thomas
Sent: Monday, April 22, 2024 5:21 PM
To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

[AMD Official Use Only - General]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Monday, April 22, 2024 4:14 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>;
> Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas
> <YiPeng.Chai@amd.com>
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57
> +++++++++++++++++++++++++-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
>  #include "umc/umc_12_0_0_sh_mask.h"
>  #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT  16

> [Tao] We already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for this
> purpose.

[Thomas]  This is defined from the EEPROM's point of view: it is the maximum number of EEPROM table entries written at a time.
The actual number of data items that need to be written to the EEPROM may be less than this value (e.g. later, when the data items are written before address conversion), or may be more than this value.
If the number is less than this value, the actual number of items is written; if it is more than this value, the items are written in multiple batches.


> +
>  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
>                                           uint32_t node_inst,
>                                           uint32_t umc_inst, @@ -633,6
> +635,58 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
>       return 0;
>  }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> +                             struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> +     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> +     uint32_t i = 0;
> +     int ret = 0;
> +
> +     if (!err_data || !ecc_err)
> +             return -EINVAL;
> +
> +     for (i = 0; i < ecc_err->err_pages.count; i++) {
> +             ret = amdgpu_umc_fill_error_record(err_data,
> +                             ecc_err->addr,
> +                             ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> +                             MCA_IPID_2_UMC_CH(ecc_err->ipid),
> +                             MCA_IPID_2_UMC_INST(ecc_err->ipid));
> +             if (ret)
> +                     break;
> +     }
> +
> +     err_data->de_count++;
> +
> +     return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> +                                     void *ras_error_status) {
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> +     struct radix_tree_root *ecc_tree;
> +     int new_detected, ret, i;
> +
> +     ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> +     mutex_lock(&con->umc_ecc_log.lock);
> +     new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> +                     0, ARRAY_SIZE(entries),
> UMC_ECC_NEW_DETECTED_TAG);
> +     for (i = 0; i < new_detected; i++) {
> +             if (!entries[i])
> +                     continue;
> +
> +             ret = umc_v12_0_fill_error_record(adev, entries[i],
> ras_error_status);
> +             if (ret) {
> +                     dev_err(adev->dev, "Fail to fill umc error
> + record,
> ret:%d\n", ret);
> +                     break;
> +             }
> +             radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> +     }
> +     mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
>  struct amdgpu_umc_ras umc_v12_0_ras = {
>       .ras_block = {
>               .hw_ops = &umc_v12_0_ras_hw_ops, @@ -640,8 +694,7 @@
> struct amdgpu_umc_ras umc_v12_0_ras = {
>       },
>       .err_cnt_init = umc_v12_0_err_cnt_init,
>       .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> -     .ecc_info_query_ras_error_count =
> umc_v12_0_ecc_info_query_ras_error_count,
> -     .ecc_info_query_ras_error_address =
> umc_v12_0_ecc_info_query_ras_error_address,
> +     .ecc_info_query_ras_error_address =
> umc_v12_0_query_ras_ecc_err_addr,
>       .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
>       .update_ecc_status = umc_v12_0_update_ecc_status,  };
> --
> 2.34.1



^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 04/15] drm/amdgpu: add poison creation handler
  2024-04-18  2:58 ` [PATCH 04/15] drm/amdgpu: add poison creation handler YiPeng Chai
@ 2024-04-25  2:32   ` Zhang, Hawking
  2024-04-25  3:35     ` Chai, Thomas
  0 siblings, 1 reply; 29+ messages in thread
From: Zhang, Hawking @ 2024-04-25  2:32 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx
  Cc: Chai, Thomas, Zhou1, Tao, Li, Candice, Wang, Yang(Kevin),
	Yang, Stanley, Chai, Thomas

[AMD Official Use Only - General]

Is it okay to drop the static function below and just implement the logic in the poison creation handler, leveraging the RAS query API amdgpu_ras_query_error_status?

It seems to me that the static function may not be usable for other IP blocks.

Regards,
Hawking

+ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block ras_block, uint32_t timeout_ms) {
+       int ret = 0;
+       struct ras_ecc_log_info *ecc_log;
+       struct ras_query_if info;
+       uint32_t timeout = timeout_ms;
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+       memset(&info, 0, sizeof(info));
+       info.head.block = ras_block;
+
+       ecc_log = &ras->umc_ecc_log;
+       ecc_log->de_updated = false;
+       do {
+               ret = amdgpu_ras_query_error_status(adev, &info);
+               if (ret) {
+                       dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
+                       return ret;
+               }
+
+               if (timeout && !ecc_log->de_updated) {
+                       msleep(1);
+                       timeout--;
+               }
+       } while (timeout && !ecc_log->de_updated);
+
+       if (timeout_ms && !timeout) {
+               dev_warn(adev->dev, "Can't find deferred error\n");
+               return -ETIMEDOUT;
+       }
+
+       return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+                                       uint32_t timeout)
+{
+       amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); }
+

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of YiPeng Chai
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH 04/15] drm/amdgpu: add poison creation handler

Add poison creation handler.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 74 +++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 64e6e20c6de7..126616eaeec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2080,6 +2080,17 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj  {
        dev_info(obj->adev->dev,
                "Poison is created\n");
+
+       if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+               struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
+
+               amdgpu_ras_put_poison_req(obj->adev,
+                       AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
+
+               atomic_inc(&con->page_retirement_req_cnt);
+
+               wake_up(&con->page_retirement_wq);
+       }
 }

 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, @@ -2754,10 +2765,54 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        mutex_destroy(&ecc_log->lock);
        ecc_log->de_updated = false;
 }
+
+static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block ras_block, uint32_t timeout_ms) {
+       int ret = 0;
+       struct ras_ecc_log_info *ecc_log;
+       struct ras_query_if info;
+       uint32_t timeout = timeout_ms;
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+       memset(&info, 0, sizeof(info));
+       info.head.block = ras_block;
+
+       ecc_log = &ras->umc_ecc_log;
+       ecc_log->de_updated = false;
+       do {
+               ret = amdgpu_ras_query_error_status(adev, &info);
+               if (ret) {
+                       dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
+                       return ret;
+               }
+
+               if (timeout && !ecc_log->de_updated) {
+                       msleep(1);
+                       timeout--;
+               }
+       } while (timeout && !ecc_log->de_updated);
+
+       if (timeout_ms && !timeout) {
+               dev_warn(adev->dev, "Can't find deferred error\n");
+               return -ETIMEDOUT;
+       }
+
+       return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+                                       uint32_t timeout)
+{
+       amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); }
+
 static int amdgpu_ras_page_retirement_thread(void *param)  {
        struct amdgpu_device *adev = (struct amdgpu_device *)param;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_poison_msg poison_msg;
+       enum amdgpu_ras_block ras_block;

        while (!kthread_should_stop()) {

@@ -2768,13 +2823,22 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                if (kthread_should_stop())
                        break;

-               dev_info(adev->dev, "Start processing page retirement. request:%d\n",
-                       atomic_read(&con->page_retirement_req_cnt));
-
                atomic_dec(&con->page_retirement_req_cnt);

-               amdgpu_umc_bad_page_polling_timeout(adev,
-                               0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+               if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
+                       continue;
+
+               ras_block = poison_msg.block;
+
+               dev_info(adev->dev, "Start processing ras block %s(%d)\n",
+                               ras_block_str(ras_block), ras_block);
+
+               if (ras_block == AMDGPU_RAS_BLOCK__UMC)
+                       amdgpu_ras_poison_creation_handler(adev,
+                               MAX_UMC_POISON_POLLING_TIME_ASYNC);
+               else
+                       amdgpu_umc_bad_page_polling_timeout(adev,
+                               false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
        }

        return 0;
--
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption
  2024-04-18  2:58 ` [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption YiPeng Chai
@ 2024-04-25  3:00   ` Zhang, Hawking
  2024-04-25  3:17     ` Chai, Thomas
  0 siblings, 1 reply; 29+ messages in thread
From: Zhang, Hawking @ 2024-04-25  3:00 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx
  Cc: Zhou1, Tao, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t reset);

So we ultimately switch to the above poison consumption handler for all the existing v9 adapters, right? If so, we should be able to make this function backwards compatible. I'm wondering if we can just change the existing amdgpu_amdkfd_ras_poison_consumption_handler.

Pasid_poison_consumption_handler is a little bit confusing.

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Thursday, April 18, 2024 10:59
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

Prepare to handle pasid poison consumption.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  9 ++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       | 20 ++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h       |  3 +++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 66753940bb4d..287ce431901c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -759,10 +759,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
        return amdgpu_ras_get_fed_status(adev);  }

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                               enum amdgpu_ras_block block, uint16_t pasid,
+                               pasid_notify pasid_fn, void *data, uint32_t reset) {
+       amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, data,
+reset); }
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
        enum amdgpu_ras_block block, uint32_t reset)  {
-       amdgpu_umc_poison_handler(adev, block, reset);
+       amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
 }

 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ad50c7bbc326..54e15994d02b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -401,6 +401,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, uint32_t reset);
+
+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t reset);
+
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);  bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);  void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dcda3d24bee3..8ebbca9e2e22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -252,8 +252,9 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
        return 0;
 }

-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-                       enum amdgpu_ras_block block, uint32_t reset)
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t reset)
 {
        int ret = AMDGPU_RAS_SUCCESS;

@@ -291,16 +292,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,

                        amdgpu_ras_error_data_fini(&err_data);
                } else {
-                       if (reset) {
-                               amdgpu_umc_bad_page_polling_timeout(adev,
-                                                       reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
-                       } else {
                                struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

+                               amdgpu_ras_put_poison_req(adev,
+                                       block, pasid, pasid_fn, data, reset);
+
                                atomic_inc(&con->page_retirement_req_cnt);

                                wake_up(&con->page_retirement_wq);
-                       }
                }
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
@@ -313,6 +312,13 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
        return ret;
 }

+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint32_t reset) {
+       return amdgpu_umc_pasid_poison_handler(adev,
+                               block, 0, NULL, NULL, reset);
+}
+
 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
                void *ras_error_status,
                struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 9e77e6d48e3b..5f50c69c3cec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -106,6 +106,9 @@ int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);  int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);  int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, uint32_t reset);
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4..6bf4bbc3cffa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -201,7 +201,8 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);

-       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
+       amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
+               block, pasid, NULL, NULL, reset);
 }

 static bool context_id_expected(struct kfd_dev *dev)
--
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address
  2024-04-18  2:58 ` [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address YiPeng Chai
@ 2024-04-25  3:02   ` Zhang, Hawking
  2024-04-25  3:31     ` Chai, Thomas
  0 siblings, 1 reply; 29+ messages in thread
From: Zhang, Hawking @ 2024-04-25  3:02 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx
  Cc: Zhou1, Tao, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

I might lose some context here. Can you please elaborate why we don't leverage the existing umc_v12_0_convert_error_address implementation?

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

Umc v12_0 converts error address.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 94 +++++++++++++++++++++++++-  drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 12 ++++
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 81435533c4a7..085dcfe16b5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
        }
 }

+static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
+                               struct ta_ras_query_address_input *addr_in,
+                               uint64_t *pfns, int len)
+{
+       uint32_t col, row, row_xor, bank, channel_index;
+       uint64_t soc_pa, retired_page, column, err_addr;
+       struct ta_ras_query_address_output addr_out;
+       uint32_t pos = 0;
+
+       err_addr = addr_in->ma.err_addr;
+       addr_in->addr_type = TA_RAS_MCA_TO_PA;
+       if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
+               dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
+                       err_addr);
+               return 0;
+       }
+
+       soc_pa = addr_out.pa.pa;
+       bank = addr_out.pa.bank;
+       channel_index = addr_out.pa.channel_idx;
+
+       col = (err_addr >> 1) & 0x1fULL;
+       row = (err_addr >> 10) & 0x3fffULL;
+       row_xor = row ^ (0x1ULL << 13);
+       /* clear [C3 C2] in soc physical address */
+       soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+       /* clear [C4] in soc physical address */
+       soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+       /* loop for all possibilities of [C4 C3 C2] */
+       for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+               retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+               retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+
+               if (pos >= len)
+                       return 0;
+               pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+               /* include column bit 0 and 1 */
+               col &= 0x3;
+               col |= (column << 2);
+               dev_info(adev->dev,
+                       "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
+                       retired_page, row, col, bank, channel_index);
+
+               /* shift R13 bit */
+               retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+
+               if (pos >= len)
+                       return 0;
+               pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+               dev_info(adev->dev,
+                       "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
+                       retired_page, row_xor, col, bank, channel_index);
+       }
+
+       return pos;
+}
+
 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
                                        uint32_t node_inst, uint32_t umc_inst,
                                        uint32_t ch_inst, void *data)
@@ -482,8 +542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common  static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
                        uint64_t status, uint64_t ipid, uint64_t addr)  {
-       uint16_t hwid, mcatype;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       uint16_t hwid, mcatype;
+       struct ta_ras_query_address_input addr_in;
+       uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
+       uint64_t err_addr;
+       int count;

        hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
        mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -497,6 +561,34 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        if (!umc_v12_0_is_deferred_error(adev, status))
                return 0;

+       err_addr = REG_GET_FIELD(addr,
+                               MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+
+       dev_info(adev->dev,
+               "UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n",
+               ipid,
+               MCA_IPID_2_SOCKET_ID(ipid),
+               MCA_IPID_2_DIE_ID(ipid),
+               MCA_IPID_2_UMC_INST(ipid),
+               MCA_IPID_2_UMC_CH(ipid),
+               err_addr);
+
+       memset(page_pfn, 0, sizeof(page_pfn));
+
+       memset(&addr_in, 0, sizeof(addr_in));
+       addr_in.ma.err_addr = err_addr;
+       addr_in.ma.ch_inst = MCA_IPID_2_UMC_CH(ipid);
+       addr_in.ma.umc_inst = MCA_IPID_2_UMC_INST(ipid);
+       addr_in.ma.node_inst = MCA_IPID_2_DIE_ID(ipid);
+       addr_in.ma.socket_id = MCA_IPID_2_SOCKET_ID(ipid);
+
+       count = umc_v12_0_convert_err_addr(adev,
+                               &addr_in, page_pfn, ARRAY_SIZE(page_pfn));
+       if (count <= 0) {
+               dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
+               return 0;
+       }
+
        con->umc_ecc_log.de_updated = true;

        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 5c2d7e127608..b4974793850b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -69,6 +69,18 @@
                        (((_ipid_lo) >> 12) & 0xF))
 #define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)

+#define MCA_IPID_2_DIE_ID(ipid)  ((REG_GET_FIELD(ipid, MCMP1_IPIDT0,
+InstanceIdHi) >> 2) & 0x03)
+
+#define MCA_IPID_2_UMC_CH(ipid) \
+       (MCA_IPID_LO_2_UMC_CH(REG_GET_FIELD(ipid, MCMP1_IPIDT0,
+InstanceIdLo)))
+
+#define MCA_IPID_2_UMC_INST(ipid) \
+       (MCA_IPID_LO_2_UMC_INST(REG_GET_FIELD(ipid, MCMP1_IPIDT0,
+InstanceIdLo)))
+
+#define MCA_IPID_2_SOCKET_ID(ipid) \
+       (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
+        (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))
+
 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);  bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);  bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
--
2.34.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption
  2024-04-25  3:00   ` Zhang, Hawking
@ 2024-04-25  3:17     ` Chai, Thomas
  0 siblings, 0 replies; 29+ messages in thread
From: Chai, Thomas @ 2024-04-25  3:17 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx
  Cc: Zhou1, Tao, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@amd.com>
Sent: Thursday, April 25, 2024 11:01 AM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

[AMD Official Use Only - General]

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t
+reset);

> So we ultimately switch to the above poison consumption handler for all the existing v9 adapters, right? If so, we should be able to make this function backwards compatible. I'm wondering if we can just change the existing amdgpu_amdkfd_ras_poison_consumption_handler.

> Pasid_poison_consumption_handler is a little bit confusing.

[Thomas] No. Only when UMC_HWIP is greater than or equal to IP_VERSION(12, 0, 0) does it take the new path. The IP check is in the amdgpu_umc_pasid_poison_handler function.



Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Thursday, April 18, 2024 10:59
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

Prepare to handle pasid poison consumption.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  9 ++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       | 20 ++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h       |  3 +++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 66753940bb4d..287ce431901c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -759,10 +759,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
        return amdgpu_ras_get_fed_status(adev);  }

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                               enum amdgpu_ras_block block, uint16_t pasid,
+                               pasid_notify pasid_fn, void *data, uint32_t reset) {
+       amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn,
+data, reset); }
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
        enum amdgpu_ras_block block, uint32_t reset)  {
-       amdgpu_umc_poison_handler(adev, block, reset);
+       amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL,
+ reset);
 }

 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ad50c7bbc326..54e15994d02b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -401,6 +401,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);  void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, uint32_t reset);
+
+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t
+reset);
+
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);  bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);  void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dcda3d24bee3..8ebbca9e2e22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -252,8 +252,9 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
        return 0;
 }

-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-                       enum amdgpu_ras_block block, uint32_t reset)
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t
+reset)
 {
        int ret = AMDGPU_RAS_SUCCESS;

@@ -291,16 +292,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,

                        amdgpu_ras_error_data_fini(&err_data);
                } else {
-                       if (reset) {
-                               amdgpu_umc_bad_page_polling_timeout(adev,
-                                                       reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
-                       } else {
                                struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

+                               amdgpu_ras_put_poison_req(adev,
+                                       block, pasid, pasid_fn, data,
+ reset);
+
                                atomic_inc(&con->page_retirement_req_cnt);

                                wake_up(&con->page_retirement_wq);
-                       }
                }
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
@@ -313,6 +312,13 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
        return ret;
 }

+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint32_t reset) {
+       return amdgpu_umc_pasid_poison_handler(adev,
+                               block, 0, NULL, NULL, reset); }
+
 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
                void *ras_error_status,
                struct amdgpu_iv_entry *entry) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 9e77e6d48e3b..5f50c69c3cec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -106,6 +106,9 @@ int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);  int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);  int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, uint32_t reset);
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t
+reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4..6bf4bbc3cffa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -201,7 +201,8 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);

-       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
+       amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
+               block, pasid, NULL, NULL, reset);
 }

 static bool context_id_expected(struct kfd_dev *dev)
--
2.34.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address
  2024-04-25  3:02   ` Zhang, Hawking
@ 2024-04-25  3:31     ` Chai, Thomas
  0 siblings, 0 replies; 29+ messages in thread
From: Chai, Thomas @ 2024-04-25  3:31 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx
  Cc: Zhou1, Tao, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

amdgpu_umc_fill_error_record is called directly in umc_v12_0_convert_error_address to prepare for page retirement.
The new path needs to check whether these converted pages already exist before filling the error record, so umc_v12_0_convert_error_address is not suitable for the new requirements; that is why I created a new interface.

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@amd.com>
Sent: Thursday, April 25, 2024 11:03 AM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

[AMD Official Use Only - General]

I might lose some context here. Can you please elaborate why we don't leverage the existing umc_v12_0_convert_error_address implementation?

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

Umc v12_0 converts error address.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 94 +++++++++++++++++++++++++-  drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 12 ++++
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 81435533c4a7..085dcfe16b5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
        }
 }

+static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
+                               struct ta_ras_query_address_input *addr_in,
+                               uint64_t *pfns, int len) {
+       uint32_t col, row, row_xor, bank, channel_index;
+       uint64_t soc_pa, retired_page, column, err_addr;
+       struct ta_ras_query_address_output addr_out;
+       uint32_t pos = 0;
+
+       err_addr = addr_in->ma.err_addr;
+       addr_in->addr_type = TA_RAS_MCA_TO_PA;
+       if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
+               dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
+                       err_addr);
+               return 0;
+       }
+
+       soc_pa = addr_out.pa.pa;
+       bank = addr_out.pa.bank;
+       channel_index = addr_out.pa.channel_idx;
+
+       col = (err_addr >> 1) & 0x1fULL;
+       row = (err_addr >> 10) & 0x3fffULL;
+       row_xor = row ^ (0x1ULL << 13);
+       /* clear [C3 C2] in soc physical address */
+       soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+       /* clear [C4] in soc physical address */
+       soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+       /* loop for all possibilities of [C4 C3 C2] */
+       for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+               retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+               retired_page |= (((column & 0x4) >> 2) <<
+ UMC_V12_0_PA_C4_BIT);
+
+               if (pos >= len)
+                       return 0;
+               pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+               /* include column bit 0 and 1 */
+               col &= 0x3;
+               col |= (column << 2);
+               dev_info(adev->dev,
+                       "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
+                       retired_page, row, col, bank, channel_index);
+
+               /* shift R13 bit */
+               retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+
+               if (pos >= len)
+                       return 0;
+               pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+               dev_info(adev->dev,
+                       "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
+                       retired_page, row_xor, col, bank, channel_index);
+       }
+
+       return pos;
+}
+
 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
                                        uint32_t node_inst, uint32_t umc_inst,
                                        uint32_t ch_inst, void *data) @@ -482,8 +542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common  static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
                        uint64_t status, uint64_t ipid, uint64_t addr)  {
-       uint16_t hwid, mcatype;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       uint16_t hwid, mcatype;
+       struct ta_ras_query_address_input addr_in;
+       uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
+       uint64_t err_addr;
+       int count;

        hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
        mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -497,6 +561,34 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        if (!umc_v12_0_is_deferred_error(adev, status))
                return 0;

+       err_addr = REG_GET_FIELD(addr,
+                               MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+
+       dev_info(adev->dev,
+               "UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n",
+               ipid,
+               MCA_IPID_2_SOCKET_ID(ipid),
+               MCA_IPID_2_DIE_ID(ipid),
+               MCA_IPID_2_UMC_INST(ipid),
+               MCA_IPID_2_UMC_CH(ipid),
+               err_addr);
+
+       memset(page_pfn, 0, sizeof(page_pfn));
+
+       memset(&addr_in, 0, sizeof(addr_in));
+       addr_in.ma.err_addr = err_addr;
+       addr_in.ma.ch_inst = MCA_IPID_2_UMC_CH(ipid);
+       addr_in.ma.umc_inst = MCA_IPID_2_UMC_INST(ipid);
+       addr_in.ma.node_inst = MCA_IPID_2_DIE_ID(ipid);
+       addr_in.ma.socket_id = MCA_IPID_2_SOCKET_ID(ipid);
+
+       count = umc_v12_0_convert_err_addr(adev,
+                               &addr_in, page_pfn, ARRAY_SIZE(page_pfn));
+       if (count <= 0) {
+               dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
+               return 0;
+       }
+
        con->umc_ecc_log.de_updated = true;

        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 5c2d7e127608..b4974793850b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -69,6 +69,18 @@
                        (((_ipid_lo) >> 12) & 0xF))
 #define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)

+#define MCA_IPID_2_DIE_ID(ipid)  ((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) >> 2) & 0x03)
+
+#define MCA_IPID_2_UMC_CH(ipid) \
+       (MCA_IPID_LO_2_UMC_CH(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
+
+#define MCA_IPID_2_UMC_INST(ipid) \
+       (MCA_IPID_LO_2_UMC_INST(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
+
+#define MCA_IPID_2_SOCKET_ID(ipid) \
+       (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
+        (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))
+
 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
--
2.34.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* RE: [PATCH 04/15] drm/amdgpu: add poison creation handler
  2024-04-25  2:32   ` Zhang, Hawking
@ 2024-04-25  3:35     ` Chai, Thomas
  0 siblings, 0 replies; 29+ messages in thread
From: Chai, Thomas @ 2024-04-25  3:35 UTC (permalink / raw)
  To: Zhang, Hawking, amd-gfx
  Cc: Zhou1, Tao, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

OK, I will do this.


-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@amd.com>
Sent: Thursday, April 25, 2024 10:33 AM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: RE: [PATCH 04/15] drm/amdgpu: add poison creation handler

[AMD Official Use Only - General]

Is it okay to drop the static function below and just implement the logic in the poison creation handler, leveraging the RAS query API amdgpu_ras_query_error_status?

It seems to me that the static function may not be usable for other IP blocks.

Regards,
Hawking

+static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
+{
+       int ret = 0;
+       struct ras_ecc_log_info *ecc_log;
+       struct ras_query_if info;
+       uint32_t timeout = timeout_ms;
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+       memset(&info, 0, sizeof(info));
+       info.head.block = ras_block;
+
+       ecc_log = &ras->umc_ecc_log;
+       ecc_log->de_updated = false;
+       do {
+               ret = amdgpu_ras_query_error_status(adev, &info);
+               if (ret) {
+                       dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
+                       return ret;
+               }
+
+               if (timeout && !ecc_log->de_updated) {
+                       msleep(1);
+                       timeout--;
+               }
+       } while (timeout && !ecc_log->de_updated);
+
+       if (timeout_ms && !timeout) {
+               dev_warn(adev->dev, "Can't find deferred error\n");
+               return -ETIMEDOUT;
+       }
+
+       return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+                                       uint32_t timeout)
+{
+       amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+}
+

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of YiPeng Chai
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
Subject: [PATCH 04/15] drm/amdgpu: add poison creation handler

Add poison creation handler.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 74 +++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 64e6e20c6de7..126616eaeec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2080,6 +2080,17 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
 {
        dev_info(obj->adev->dev,
                "Poison is created\n");
+
+       if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+               struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
+
+               amdgpu_ras_put_poison_req(obj->adev,
+                       AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
+
+               atomic_inc(&con->page_retirement_req_cnt);
+
+               wake_up(&con->page_retirement_wq);
+       }
 }

 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
@@ -2754,10 +2765,54 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        mutex_destroy(&ecc_log->lock);
        ecc_log->de_updated = false;
 }
+
+static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
+{
+       int ret = 0;
+       struct ras_ecc_log_info *ecc_log;
+       struct ras_query_if info;
+       uint32_t timeout = timeout_ms;
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+       memset(&info, 0, sizeof(info));
+       info.head.block = ras_block;
+
+       ecc_log = &ras->umc_ecc_log;
+       ecc_log->de_updated = false;
+       do {
+               ret = amdgpu_ras_query_error_status(adev, &info);
+               if (ret) {
+                       dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
+                       return ret;
+               }
+
+               if (timeout && !ecc_log->de_updated) {
+                       msleep(1);
+                       timeout--;
+               }
+       } while (timeout && !ecc_log->de_updated);
+
+       if (timeout_ms && !timeout) {
+               dev_warn(adev->dev, "Can't find deferred error\n");
+               return -ETIMEDOUT;
+       }
+
+       return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+                                       uint32_t timeout)
+{
+       amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+}
+
 static int amdgpu_ras_page_retirement_thread(void *param)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)param;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_poison_msg poison_msg;
+       enum amdgpu_ras_block ras_block;

        while (!kthread_should_stop()) {

@@ -2768,13 +2823,22 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                if (kthread_should_stop())
                        break;

-               dev_info(adev->dev, "Start processing page retirement. request:%d\n",
-                       atomic_read(&con->page_retirement_req_cnt));
-
                atomic_dec(&con->page_retirement_req_cnt);

-               amdgpu_umc_bad_page_polling_timeout(adev,
-                               0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+               if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
+                       continue;
+
+               ras_block = poison_msg.block;
+
+               dev_info(adev->dev, "Start processing ras block %s(%d)\n",
+                               ras_block_str(ras_block), ras_block);
+
+               if (ras_block == AMDGPU_RAS_BLOCK__UMC)
+                       amdgpu_ras_poison_creation_handler(adev,
+                               MAX_UMC_POISON_POLLING_TIME_ASYNC);
+               else
+                       amdgpu_umc_bad_page_polling_timeout(adev,
+                               false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
        }

        return 0;
--
2.34.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2024-04-25  3:35 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-18  2:58 [PATCH 01/15] drm/amdgpu: Add interface to reserve bad page YiPeng Chai
2024-04-18  2:58 ` [PATCH 02/15] drm/amdgpu: add message fifo to handle RAS poison events YiPeng Chai
2024-04-18  2:58 ` [PATCH 03/15] drm/amdgpu: prepare for logging ecc errors YiPeng Chai
2024-04-18  2:58 ` [PATCH 04/15] drm/amdgpu: add poison creation handler YiPeng Chai
2024-04-25  2:32   ` Zhang, Hawking
2024-04-25  3:35     ` Chai, Thomas
2024-04-18  2:58 ` [PATCH 05/15] drm/amdgpu: add interface to update umc v12_0 ecc status YiPeng Chai
2024-04-18  2:58 ` [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address YiPeng Chai
2024-04-25  3:02   ` Zhang, Hawking
2024-04-25  3:31     ` Chai, Thomas
2024-04-18  2:58 ` [PATCH 07/15] drm/amdgpu: umc v12_0 logs ecc errors YiPeng Chai
2024-04-18  2:58 ` [PATCH 08/15] drm/amdgpu: Add delay work to retire bad pages YiPeng Chai
2024-04-18  2:58 ` [PATCH 09/15] drm/amdgpu: add condition check for amdgpu_umc_fill_error_record YiPeng Chai
2024-04-18  2:58 ` [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0 YiPeng Chai
2024-04-22  8:14   ` Zhou1, Tao
2024-04-22  9:21     ` Chai, Thomas
2024-04-22  9:33       ` Chai, Thomas
2024-04-18  2:58 ` [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption YiPeng Chai
2024-04-25  3:00   ` Zhang, Hawking
2024-04-25  3:17     ` Chai, Thomas
2024-04-18  2:58 ` [PATCH 12/15] drm/amdgpu: add poison consumption handler YiPeng Chai
2024-04-18  2:58 ` [PATCH 13/15] drm/amdgpu: support ACA logging ecc errors YiPeng Chai
2024-04-18  2:58 ` [PATCH 14/15] drm/amdgpu: Fix address translation defect YiPeng Chai
2024-04-18  2:58 ` [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page YiPeng Chai
2024-04-18  9:00   ` Christian König
2024-04-18  9:34     ` Chai, Thomas
2024-04-22  8:27       ` Zhou1, Tao
2024-04-22  2:25   ` Chai, Thomas
2024-04-22  7:06 ` [PATCH 01/15] drm/amdgpu: Add " Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).