All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] add support for ras page retirement
@ 2019-08-30 12:24 Tao Zhou
       [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Tao Zhou @ 2019-08-30 12:24 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	andrey.grodzovsky-5C7GfCeVMHo, guchun.chen-5C7GfCeVMHo,
	dennis.li-5C7GfCeVMHo, hawking.zhang-5C7GfCeVMHo
  Cc: Tao Zhou

This series saves umc error page info into a record structure and stores
records to eeprom, it also loads error records from eeprom and reservers
related retired pages during gpu init.


Tao Zhou (4):
  drm/amdgpu: change ras bps type to eeprom table record structure
  drm/amdgpu: Hook EEPROM table to RAS
  drm/amdgpu: save umc error records
  drm/amdgpu: move the call of ras recovery_init and bad page reserve to
    proper place

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  16 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 170 +++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  18 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  29 +++-
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c      |  39 ++++-
 5 files changed, 202 insertions(+), 70 deletions(-)

-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure
       [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-30 12:24   ` Tao Zhou
       [not found]     ` <20190830122453.19703-2-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  2019-08-30 12:24   ` [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS Tao Zhou
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Tao Zhou @ 2019-08-30 12:24 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	andrey.grodzovsky-5C7GfCeVMHo, guchun.chen-5C7GfCeVMHo,
	dennis.li-5C7GfCeVMHo, hawking.zhang-5C7GfCeVMHo
  Cc: Tao Zhou

change bps type from retired page to eeprom table record, prepare for
saving error records to eeprom

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
 2 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2ca3997d4b3a..24663ec41248 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 
 	for (; i < data->count; i++) {
 		(*bps)[i] = (struct ras_badpage){
-			.bp = data->bps[i].bp,
+			.bp = data->bps[i].retired_page,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = 0,
 		};
 
 		if (data->last_reserved <= i)
 			(*bps)[i].flags = 1;
-		else if (data->bps[i].bo == NULL)
+		else if (data->bps_bo[i] == NULL)
 			(*bps)[i].flags = 2;
 	}
 
@@ -1288,30 +1288,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 {
 	unsigned int old_space = data->count + data->space_left;
 	unsigned int new_space = old_space + pages;
-	unsigned int align_space = ALIGN(new_space, 1024);
-	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-
-	if (!tmp)
+	unsigned int align_space = ALIGN(new_space, 512);
+	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+	struct amdgpu_bo **bps_bo =
+			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
+
+	if (!bps || !bps_bo) {
+		kfree(bps);
+		kfree(bps_bo);
 		return -ENOMEM;
+	}
 
 	if (data->bps) {
-		memcpy(tmp, data->bps,
+		memcpy(bps, data->bps,
 				data->count * sizeof(*data->bps));
 		kfree(data->bps);
 	}
+	if (data->bps_bo) {
+		memcpy(bps_bo, data->bps_bo,
+				data->count * sizeof(*data->bps_bo));
+		kfree(data->bps_bo);
+	}
 
-	data->bps = tmp;
+	data->bps = bps;
+	data->bps_bo = bps_bo;
 	data->space_left += align_space - old_space;
 	return 0;
 }
 
 /* it deal with vram only. */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages)
+		struct eeprom_table_record *bps, int pages)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
-	int i = pages;
 	int ret = 0;
 
 	if (!con || !con->eh_data || !bps || pages <= 0)
@@ -1328,10 +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			goto out;
 		}
 
-	while (i--)
-		data->bps[data->count++].bp = bps[i];
-
+	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
+	data->count += pages;
 	data->space_left -= pages;
+
 out:
 	mutex_unlock(&con->recovery_lock);
 
@@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		goto out;
 	/* reserve vram at driver post stage. */
 	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].bp;
+		bp = data->bps[i].retired_page;
 
 		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
 					PAGE_SIZE, &bo))
 			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
 out:
@@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 		goto out;
 
 	for (i = data->last_reserved - 1; i >= 0; i--) {
-		bo = data->bps[i].bo;
+		bo = data->bps_bo[i];
 
 		amdgpu_ras_release_vram(adev, &bo);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i;
 	}
 out:
@@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 {
-	/* TODO
-	 * read the array to eeprom when SMU disabled.
-	 */
-	return 0;
+	struct eeprom_table_record *bps = NULL;
+	int ret;
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps,
+				adev->umc.max_ras_err_cnt_per_query);
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 66b71525446e..b6bac873c588 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -351,11 +351,10 @@ struct ras_err_data {
 };
 
 struct ras_err_handler_data {
-	/* point to bad pages array */
-	struct {
-		unsigned long bp;
-		struct amdgpu_bo *bo;
-	} *bps;
+	/* point to bad page records array */
+	struct eeprom_table_record *bps;
+	/* point to reserved bo array */
+	struct amdgpu_bo **bps_bo;
 	/* the count of entries */
 	int count;
 	/* the space can place new entries */
@@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages);
+		struct eeprom_table_record *bps, int pages);
 
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
       [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  2019-08-30 12:24   ` [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure Tao Zhou
@ 2019-08-30 12:24   ` Tao Zhou
       [not found]     ` <20190830122453.19703-3-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  2019-08-30 12:24   ` [PATCH 3/4] drm/amdgpu: save umc error records Tao Zhou
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Tao Zhou @ 2019-08-30 12:24 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	andrey.grodzovsky-5C7GfCeVMHo, guchun.chen-5C7GfCeVMHo,
	dennis.li-5C7GfCeVMHo, hawking.zhang-5C7GfCeVMHo
  Cc: Tao Zhou

support eeprom records load and save for ras,
move EEPROM records storing to bad page reserving

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++------
 1 file changed, 83 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 24663ec41248..02120aa3cb5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	return ret;
 }
 
+/*
+ * write error record array to eeprom, the function should be
+ * protected by recovery_lock
+ */
+static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	struct amdgpu_ras_eeprom_control *control =
+					&adev->psp.ras.ras->eeprom_control;
+	int save_count;
+
+	if (!con || !con->eh_data)
+		return 0;
+
+	data = con->eh_data;
+	if (!data)
+		return 0;
+
+	save_count = data->count - control->num_recs;
+	/* only new entries are saved */
+	if (save_count > 0)
+		if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control,
+							&data->bps[control->num_recs],
+							true,
+							save_count)) {
+			DRM_ERROR("Failed to save EEPROM table data!");
+			return -EIO;
+		}
+
+	return 0;
+}
+
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
+static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_eeprom_control *control =
+					&adev->psp.ras.ras->eeprom_control;
+	struct eeprom_table_record *bps = NULL;
+	int ret = 0;
+
+	/* no bad page record, skip eeprom access */
+	if (!control->num_recs)
+		return ret;
+
+	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
+	if (!bps)
+		return -ENOMEM;
+
+	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
+		control->num_recs)) {
+		DRM_ERROR("Failed to load EEPROM table records!");
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
+
+out:
+	kfree(bps);
+	return ret;
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 {
@@ -1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 	struct ras_err_handler_data *data;
 	uint64_t bp;
 	struct amdgpu_bo *bo;
-	int i;
+	int i, ret = 0;
 
 	if (!con || !con->eh_data)
 		return 0;
@@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
+
+	ret = amdgpu_ras_save_bad_pages(adev);
 out:
 	mutex_unlock(&con->recovery_lock);
-	return 0;
+	return ret;
 }
 
 /* called when driver unload */
@@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
-{
-	/* TODO
-	 * write the array to eeprom when SMU disabled.
-	 */
-	return 0;
-}
-
-/*
- * read error record array in eeprom and reserve enough space for
- * storing new bad pages
- */
-static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
-{
-	struct eeprom_table_record *bps = NULL;
-	int ret;
-
-	ret = amdgpu_ras_add_bad_pages(adev, bps,
-				adev->umc.max_ras_err_cnt_per_query);
-
-	return ret;
-}
-
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
+	int ret;
 
 	*data = kmalloc(sizeof(**data),
 			GFP_KERNEL|__GFP_ZERO);
@@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	atomic_set(&con->in_recovery, 0);
 	con->adev = adev;
 
-	amdgpu_ras_load_bad_pages(adev);
-	amdgpu_ras_reserve_bad_pages(adev);
+	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
+	if (ret)
+		return ret;
+
+	if (adev->psp.ras.ras->eeprom_control.num_recs) {
+		ret = amdgpu_ras_load_bad_pages(adev);
+		if (ret)
+			return ret;
+		ret = amdgpu_ras_reserve_bad_pages(adev);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
@@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	struct ras_err_handler_data *data = con->eh_data;
 
 	cancel_work_sync(&con->recovery_work);
-	amdgpu_ras_save_bad_pages(adev);
 	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/4] drm/amdgpu: save umc error records
       [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  2019-08-30 12:24   ` [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure Tao Zhou
  2019-08-30 12:24   ` [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS Tao Zhou
@ 2019-08-30 12:24   ` Tao Zhou
  2019-08-30 12:24   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Tao Zhou
  2019-09-02  2:25   ` [PATCH 0/4] add support for ras page retirement Chen, Guchun
  4 siblings, 0 replies; 12+ messages in thread
From: Tao Zhou @ 2019-08-30 12:24 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	andrey.grodzovsky-5C7GfCeVMHo, guchun.chen-5C7GfCeVMHo,
	dennis.li-5C7GfCeVMHo, hawking.zhang-5C7GfCeVMHo
  Cc: Tao Zhou

save umc error records to ras bad page array

v2: add bad pages before gpu reset

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 29 ++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c   | 39 ++++++++++++++++++++-----
 3 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b6bac873c588..42e1d379e21c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -347,7 +347,7 @@ struct ras_err_data {
 	unsigned long ue_count;
 	unsigned long ce_count;
 	unsigned long err_addr_cnt;
-	uint64_t *err_addr;
+	struct eeprom_table_record *err_addr;
 };
 
 struct ras_err_handler_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 70a05e302d60..5015a07dcb3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -246,16 +246,35 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 	if (adev->umc.funcs->query_ras_error_count)
 		adev->umc.funcs->query_ras_error_count(adev, err_data);
-	/* umc query_ras_error_address is also responsible for clearing
-	 * error status
-	 */
-	if (adev->umc.funcs->query_ras_error_address)
+
+	if (adev->umc.funcs->query_ras_error_address &&
+		adev->umc.max_ras_err_cnt_per_query) {
+		err_data->err_addr =
+			kcalloc(adev->umc.max_ras_err_cnt_per_query,
+				sizeof(struct eeprom_table_record), GFP_KERNEL);
+		/* still call query_ras_error_address to clear error status
+		 * even NOMEM error is encountered
+		 */
+		if(!err_data->err_addr)
+			DRM_WARN("Failed to alloc memory for umc error address record!\n");
+
+		/* umc query_ras_error_address is also responsible for clearing
+		 * error status
+		 */
 		adev->umc.funcs->query_ras_error_address(adev, err_data);
+	}
 
 	/* only uncorrectable error needs gpu reset */
-	if (err_data->ue_count)
+	if (err_data->ue_count) {
+		if (err_data->err_addr_cnt &&
+			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+						err_data->err_addr_cnt))
+			DRM_WARN("Failed to add ras bad page!\n");
+
 		amdgpu_ras_reset_gpu(adev, 0);
+	}
 
+	kfree(err_data->err_addr);
 	return AMDGPU_RAS_SUCCESS;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
index 8502e736f721..a54ff170591f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
@@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
 			RSMU_UMC_INDEX_MODE_EN, 0);
 }
 
+static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
+{
+	uint32_t rsmu_umc_index;
+
+	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
+			mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
+	return REG_GET_FIELD(rsmu_umc_index,
+				RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
+				RSMU_UMC_INDEX_INSTANCE);
+}
+
 static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
 						   uint32_t umc_reg_offset,
 						   unsigned long *error_count)
@@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 					 uint32_t umc_reg_offset, uint32_t channel_index)
 {
 	uint32_t lsb, mc_umc_status_addr;
-	uint64_t mc_umc_status, err_addr;
+	uint64_t mc_umc_status, err_addr, retired_page;
+	struct eeprom_table_record *err_rec;
 
 	mc_umc_status_addr =
 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 		return;
 	}
 
+	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
 	mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);
 
 	/* calculate error address if ue/ce error is detected */
@@ -191,12 +204,24 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 		err_addr &= ~((0x1ULL << lsb) - 1);
 
 		/* translate umc channel address to soc pa, 3 parts are included */
-		err_data->err_addr[err_data->err_addr_cnt] =
-						ADDR_OF_8KB_BLOCK(err_addr) |
-						ADDR_OF_256B_BLOCK(channel_index) |
-						OFFSET_IN_256B_BLOCK(err_addr);
-
-		err_data->err_addr_cnt++;
+		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
+				ADDR_OF_256B_BLOCK(channel_index) |
+				OFFSET_IN_256B_BLOCK(err_addr);
+
+		/* we only save ue error information currently, ce is skipped */
+		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
+				== 1) {
+			err_rec->address = err_addr;
+			/* page frame address is saved */
+			err_rec->retired_page = retired_page >> PAGE_SHIFT;
+			err_rec->ts = (uint64_t)ktime_get_real_seconds();
+			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+			err_rec->cu = 0;
+			err_rec->mem_channel = channel_index;
+			err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev);
+
+			err_data->err_addr_cnt++;
+		}
 	}
 
 	/* clear umc status */
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
       [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2019-08-30 12:24   ` [PATCH 3/4] drm/amdgpu: save umc error records Tao Zhou
@ 2019-08-30 12:24   ` Tao Zhou
       [not found]     ` <20190830122453.19703-5-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  2019-09-02  2:25   ` [PATCH 0/4] add support for ras page retirement Chen, Guchun
  4 siblings, 1 reply; 12+ messages in thread
From: Tao Zhou @ 2019-08-30 12:24 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	andrey.grodzovsky-5C7GfCeVMHo, guchun.chen-5C7GfCeVMHo,
	dennis.li-5C7GfCeVMHo, hawking.zhang-5C7GfCeVMHo
  Cc: Tao Zhou

ras recovery_init should be called after ttm and smu init,
bad page reserve should be put in front of gpu reset since i2c
may be unstable during gpu reset
add cleanup for recovery_init and recovery_fini

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
 3 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 67b09cb2a9e2..4e4895ac926d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		goto failed;
 	}
 
+	/* retired pages will be loaded from eeprom and reserved here,
+	 * new bo may be created, it should be called after ttm init,
+	 * accessing eeprom also relies on smu (unlock i2c bus) and it
+	 * should be called after smu init
+	 *
+	 * Note: theoretically, this should be called before all vram allocations
+	 * to protect retired page from abusing, but there are some allocations
+	 * in front of smu fw loading
+	 */
+	amdgpu_ras_recovery_init(adev);
+
 	/* must succeed. */
 	amdgpu_ras_resume(adev);
 
@@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 						break;
 				}
 			}
-
-			list_for_each_entry(tmp_adev, device_list_handle,
-					gmc.xgmi.head) {
-				amdgpu_ras_reserve_bad_pages(tmp_adev);
-			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 02120aa3cb5d..ad67a109122f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1477,14 +1477,13 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
 	int ret;
 
-	*data = kmalloc(sizeof(**data),
-			GFP_KERNEL|__GFP_ZERO);
+	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
 	if (!*data)
 		return -ENOMEM;
 
@@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 
 	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
 	if (ret)
-		return ret;
+		goto cancel_work;
 
 	if (adev->psp.ras.ras->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto cancel_work;
 		ret = amdgpu_ras_reserve_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto release;
 	}
 
 	return 0;
+
+release:
+	amdgpu_ras_release_bad_pages(adev);
+cancel_work:
+	cancel_work_sync(&con->recovery_work);
+	con->eh_data = NULL;
+	kfree((*data)->bps);
+	kfree((*data)->bps_bo);
+	kfree(*data);
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
+	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
@@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (amdgpu_ras_recovery_init(adev))
-		goto recovery_out;
-
 	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
 	if (amdgpu_ras_fs_init(adev))
@@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			con->hw_supported, con->supported);
 	return 0;
 fs_out:
-	amdgpu_ras_recovery_fini(adev);
-recovery_out:
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 42e1d379e21c..6d00f79b612b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 	return ras && (ras->supported & (1 << block));
 }
 
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 		unsigned int block);
 
@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+	/* save bad page to eeprom before gpu reset,
+	 * i2c may be unstable in gpu reset
+	 */
+	amdgpu_ras_reserve_bad_pages(adev);
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
 		schedule_work(&ras->recovery_work);
 	return 0;
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
       [not found]     ` <20190830122453.19703-5-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-30 14:03       ` Grodzovsky, Andrey
       [not found]         ` <d70f5672-2d8e-8efe-7b08-9df1c87f98ba-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Grodzovsky, Andrey @ 2019-08-30 14:03 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Chen,
	Guchun, Li, Dennis, Zhang, Hawking


On 8/30/19 8:24 AM, Tao Zhou wrote:
> ras recovery_init should be called after ttm and smu init,
> bad page reserve should be put in front of gpu reset since i2c
> may be unstable during gpu reset
> add cleanup for recovery_init and recovery_fini
>
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
>   3 files changed, 33 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 67b09cb2a9e2..4e4895ac926d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   		goto failed;
>   	}
>   
> +	/* retired pages will be loaded from eeprom and reserved here,
> +	 * new bo may be created, it should be called after ttm init,
> +	 * accessing eeprom also relies on smu (unlock i2c bus) and it
> +	 * should be called after smu init
> +	 *
> +	 * Note: theoretically, this should be called before all vram allocations
> +	 * to protect retired page from abusing, but there are some allocations
> +	 * in front of smu fw loading
> +	 */
> +	amdgpu_ras_recovery_init(adev);


You probably should check for return value here.


> +
>   	/* must succeed. */
>   	amdgpu_ras_resume(adev);
>   
> @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   						break;
>   				}
>   			}
> -
> -			list_for_each_entry(tmp_adev, device_list_handle,
> -					gmc.xgmi.head) {
> -				amdgpu_ras_reserve_bad_pages(tmp_adev);
> -			}
>   		}
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 02120aa3cb5d..ad67a109122f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1477,14 +1477,13 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   	struct ras_err_handler_data **data = &con->eh_data;
>   	int ret;
>   
> -	*data = kmalloc(sizeof(**data),
> -			GFP_KERNEL|__GFP_ZERO);
> +	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
>   	if (!*data)
>   		return -ENOMEM;
>   
> @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   
>   	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
>   	if (ret)
> -		return ret;
> +		goto cancel_work;


Why you need do go to cancel_work_sync(&con->recovery_work) at such 
early stage of device init, is RAS active already by this time and RAS 
interrupt might fire triggering reset  ?

Andrey


>   
>   	if (adev->psp.ras.ras->eeprom_control.num_recs) {
>   		ret = amdgpu_ras_load_bad_pages(adev);
>   		if (ret)
> -			return ret;
> +			goto cancel_work;
>   		ret = amdgpu_ras_reserve_bad_pages(adev);
>   		if (ret)
> -			return ret;
> +			goto release;
>   	}
>   
>   	return 0;
> +
> +release:
> +	amdgpu_ras_release_bad_pages(adev);
> +cancel_work:
> +	cancel_work_sync(&con->recovery_work);
> +	con->eh_data = NULL;
> +	kfree((*data)->bps);
> +	kfree((*data)->bps_bo);
> +	kfree(*data);
> +
> +	return ret;
>   }
>   
>   static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
> @@ -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
>   	mutex_lock(&con->recovery_lock);
>   	con->eh_data = NULL;
>   	kfree(data->bps);
> +	kfree(data->bps_bo);
>   	kfree(data);
>   	mutex_unlock(&con->recovery_lock);
>   
> @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   			return r;
>   	}
>   
> -	if (amdgpu_ras_recovery_init(adev))
> -		goto recovery_out;
> -
>   	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
>   
>   	if (amdgpu_ras_fs_init(adev))
> @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   			con->hw_supported, con->supported);
>   	return 0;
>   fs_out:
> -	amdgpu_ras_recovery_fini(adev);
> -recovery_out:
>   	amdgpu_ras_set_context(adev, NULL);
>   	kfree(con);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 42e1d379e21c..6d00f79b612b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
>   	return ras && (ras->supported & (1 << block));
>   }
>   
> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
>   int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
>   		unsigned int block);
>   
> @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>   
> +	/* save bad page to eeprom before gpu reset,
> +	 * i2c may be unstable in gpu reset
> +	 */
> +	amdgpu_ras_reserve_bad_pages(adev);
>   	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
>   		schedule_work(&ras->recovery_work);
>   	return 0;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
       [not found]     ` <20190830122453.19703-3-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
@ 2019-09-02  2:11       ` Chen, Guchun
       [not found]         ` <SN6PR12MB2813A05D3E8BCC723AE50308F1BE0-kxOKjb6HO/Hw8A9fYknAbAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Chen, Guchun @ 2019-09-02  2:11 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky, Andrey, Li,
	Dennis, Zhang, Hawking
  Cc: Zhou1, Tao



-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Friday, August 30, 2019 8:25 PM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS

support eeprom records load and save for ras, move EEPROM records storing to bad page reserving

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++------
 1 file changed, 83 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 24663ec41248..02120aa3cb5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	return ret;
 }
 
+/*
+ * write error record array to eeprom, the function should be
+ * protected by recovery_lock
+ */
+static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) {
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	struct amdgpu_ras_eeprom_control *control =
+					&adev->psp.ras.ras->eeprom_control;
+	int save_count;
+
+	if (!con || !con->eh_data)
+		return 0;
+
+	data = con->eh_data;
+	if (!data)
+		return 0;
[Guchun]Such check (!data) is redundant and not needed. As we have checked !con->eh_data earlier, and the whole function is protected by recovery_lock.

+	save_count = data->count - control->num_recs;
+	/* only new entries are saved */
+	if (save_count > 0)
+		if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control,
+							&data->bps[control->num_recs],
+							true,
+							save_count)) {
+			DRM_ERROR("Failed to save EEPROM table data!");
+			return -EIO;
+		}
+
+	return 0;
+}
+
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
+static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) {
+	struct amdgpu_ras_eeprom_control *control =
+					&adev->psp.ras.ras->eeprom_control;
+	struct eeprom_table_record *bps = NULL;
+	int ret = 0;
+
+	/* no bad page record, skip eeprom access */
+	if (!control->num_recs)
+		return ret;
+
+	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
+	if (!bps)
+		return -ENOMEM;
+
+	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
+		control->num_recs)) {
+		DRM_ERROR("Failed to load EEPROM table records!");
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
+
+out:
+	kfree(bps);
+	return ret;
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)  { @@ -1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 	struct ras_err_handler_data *data;
 	uint64_t bp;
 	struct amdgpu_bo *bo;
-	int i;
+	int i, ret = 0;
 
 	if (!con || !con->eh_data)
 		return 0;
@@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
+
+	ret = amdgpu_ras_save_bad_pages(adev);
 out:
 	mutex_unlock(&con->recovery_lock);
-	return 0;
+	return ret;
 }
 
 /* called when driver unload */
@@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{
-	/* TODO
-	 * write the array to eeprom when SMU disabled.
-	 */
-	return 0;
-}
-
-/*
- * read error record array in eeprom and reserve enough space for
- * storing new bad pages
- */
-static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{
-	struct eeprom_table_record *bps = NULL;
-	int ret;
-
-	ret = amdgpu_ras_add_bad_pages(adev, bps,
-				adev->umc.max_ras_err_cnt_per_query);
-
-	return ret;
-}
-
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
+	int ret;
 
 	*data = kmalloc(sizeof(**data),
 			GFP_KERNEL|__GFP_ZERO);
@@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	atomic_set(&con->in_recovery, 0);
 	con->adev = adev;
 
-	amdgpu_ras_load_bad_pages(adev);
-	amdgpu_ras_reserve_bad_pages(adev);
+	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
+	if (ret)
+		return ret;
+
+	if (adev->psp.ras.ras->eeprom_control.num_recs) {
+		ret = amdgpu_ras_load_bad_pages(adev);
+		if (ret)
+			return ret;
+		ret = amdgpu_ras_reserve_bad_pages(adev);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
@@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	struct ras_err_handler_data *data = con->eh_data;
 
 	cancel_work_sync(&con->recovery_work);
-	amdgpu_ras_save_bad_pages(adev);
 	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure
       [not found]     ` <20190830122453.19703-2-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
@ 2019-09-02  2:13       ` Chen, Guchun
       [not found]         ` <SN6PR12MB2813C145D6004891FAF398B5F1BE0-kxOKjb6HO/Hw8A9fYknAbAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Chen, Guchun @ 2019-09-02  2:13 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky, Andrey, Li,
	Dennis, Zhang, Hawking
  Cc: Zhou1, Tao



-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Friday, August 30, 2019 8:25 PM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure

change bps type from retired page to eeprom table record, prepare for saving error records to eeprom

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++---------  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
 2 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2ca3997d4b3a..24663ec41248 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 
 	for (; i < data->count; i++) {
 		(*bps)[i] = (struct ras_badpage){
-			.bp = data->bps[i].bp,
+			.bp = data->bps[i].retired_page,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = 0,
 		};
 
 		if (data->last_reserved <= i)
 			(*bps)[i].flags = 1;
-		else if (data->bps[i].bo == NULL)
+		else if (data->bps_bo[i] == NULL)
 			(*bps)[i].flags = 2;
 	}
 
@@ -1288,30 +1288,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,  {
 	unsigned int old_space = data->count + data->space_left;
 	unsigned int new_space = old_space + pages;
-	unsigned int align_space = ALIGN(new_space, 1024);
-	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-
-	if (!tmp)
+	unsigned int align_space = ALIGN(new_space, 512);
[Guchun]Any special reason to change alignment from 512 to 1024?

+	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+	struct amdgpu_bo **bps_bo =
+			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
+
+	if (!bps || !bps_bo) {
+		kfree(bps);
+		kfree(bps_bo);
 		return -ENOMEM;
+	}
 
 	if (data->bps) {
-		memcpy(tmp, data->bps,
+		memcpy(bps, data->bps,
 				data->count * sizeof(*data->bps));
 		kfree(data->bps);
 	}
+	if (data->bps_bo) {
+		memcpy(bps_bo, data->bps_bo,
+				data->count * sizeof(*data->bps_bo));
+		kfree(data->bps_bo);
+	}
 
-	data->bps = tmp;
+	data->bps = bps;
+	data->bps_bo = bps_bo;
 	data->space_left += align_space - old_space;
 	return 0;
 }
 
 /* it deal with vram only. */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages)
+		struct eeprom_table_record *bps, int pages)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
-	int i = pages;
 	int ret = 0;
 
 	if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10 +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			goto out;
 		}
 
-	while (i--)
-		data->bps[data->count++].bp = bps[i];
-
+	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
+	data->count += pages;
 	data->space_left -= pages;
+
 out:
 	mutex_unlock(&con->recovery_lock);
 
@@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		goto out;
 	/* reserve vram at driver post stage. */
 	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].bp;
+		bp = data->bps[i].retired_page;
 
 		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
 					PAGE_SIZE, &bo))
 			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
 out:
@@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 		goto out;
 
 	for (i = data->last_reserved - 1; i >= 0; i--) {
-		bo = data->bps[i].bo;
+		bo = data->bps_bo[i];
 
 		amdgpu_ras_release_vram(adev, &bo);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i;
 	}
 out:
@@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)  {
-	/* TODO
-	 * read the array to eeprom when SMU disabled.
-	 */
-	return 0;
+	struct eeprom_table_record *bps = NULL;
+	int ret;
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps,
+				adev->umc.max_ras_err_cnt_per_query);
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 66b71525446e..b6bac873c588 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -351,11 +351,10 @@ struct ras_err_data {  };
 
 struct ras_err_handler_data {
-	/* point to bad pages array */
-	struct {
-		unsigned long bp;
-		struct amdgpu_bo *bo;
-	} *bps;
+	/* point to bad page records array */
+	struct eeprom_table_record *bps;
+	/* point to reserved bo array */
+	struct amdgpu_bo **bps_bo;
 	/* the count of entries */
 	int count;
 	/* the space can place new entries */
@@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages);
+		struct eeprom_table_record *bps, int pages);
 
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
 
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* RE: [PATCH 0/4] add support for ras page retirement
       [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2019-08-30 12:24   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Tao Zhou
@ 2019-09-02  2:25   ` Chen, Guchun
  4 siblings, 0 replies; 12+ messages in thread
From: Chen, Guchun @ 2019-09-02  2:25 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky, Andrey, Li,
	Dennis, Zhang, Hawking
  Cc: Zhou1, Tao

After the minor comment is addressed is Patch 1 and patch 2, the series are: Reviewed-by: Guchun Chen <guchun.chen@amd.com>

Regards,
Guchun

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Friday, August 30, 2019 8:25 PM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 0/4] add support for ras page retirement

This series saves umc error page info into a record structure and stores records to eeprom, it also loads error records from eeprom and reservers related retired pages during gpu init.


Tao Zhou (4):
  drm/amdgpu: change ras bps type to eeprom table record structure
  drm/amdgpu: Hook EEPROM table to RAS
  drm/amdgpu: save umc error records
  drm/amdgpu: move the call of ras recovery_init and bad page reserve to
    proper place

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  16 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 170 +++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  18 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  29 +++-
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c      |  39 ++++-
 5 files changed, 202 insertions(+), 70 deletions(-)

--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
       [not found]         ` <d70f5672-2d8e-8efe-7b08-9df1c87f98ba-5C7GfCeVMHo@public.gmane.org>
@ 2019-09-02  2:58           ` Zhou1, Tao
  0 siblings, 0 replies; 12+ messages in thread
From: Zhou1, Tao @ 2019-09-02  2:58 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Chen, Guchun, Li, Dennis, Zhang, Hawking



> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: 2019年8月30日 22:03
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org;
> Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>;
> Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: Re: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and
> bad page reserve to proper place
> 
> 
> On 8/30/19 8:24 AM, Tao Zhou wrote:
> > ras recovery_init should be called after ttm and smu init, bad page
> > reserve should be put in front of gpu reset since i2c may be unstable
> > during gpu reset add cleanup for recovery_init and recovery_fini
> >
> > Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++-------
> --
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
> >   3 files changed, 33 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 67b09cb2a9e2..4e4895ac926d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
> >   		goto failed;
> >   	}
> >
> > +	/* retired pages will be loaded from eeprom and reserved here,
> > +	 * new bo may be created, it should be called after ttm init,
> > +	 * accessing eeprom also relies on smu (unlock i2c bus) and it
> > +	 * should be called after smu init
> > +	 *
> > +	 * Note: theoretically, this should be called before all vram allocations
> > +	 * to protect retired page from abusing, but there are some
> allocations
> > +	 * in front of smu fw loading
> > +	 */
> > +	amdgpu_ras_recovery_init(adev);
> 
> 
> You probably should check for return value here.
[Tao] No check here is intentional, the failure of recovery_init should not stop amdgpu init process and recovery_init could free resources allocated by itself if it fails. I'll add comment here and add a DRM_WARN for recovery_init.

> 
> 
> > +
> >   	/* must succeed. */
> >   	amdgpu_ras_resume(adev);
> >
> > @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct
> amdgpu_hive_info *hive,
> >   						break;
> >   				}
> >   			}
> > -
> > -			list_for_each_entry(tmp_adev, device_list_handle,
> > -					gmc.xgmi.head) {
> > -				amdgpu_ras_reserve_bad_pages(tmp_adev);
> > -			}
> >   		}
> >   	}
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 02120aa3cb5d..ad67a109122f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -1477,14 +1477,13 @@ static int
> amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
> >   	return 0;
> >   }
> >
> > -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> >   {
> >   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> >   	struct ras_err_handler_data **data = &con->eh_data;
> >   	int ret;
> >
> > -	*data = kmalloc(sizeof(**data),
> > -			GFP_KERNEL|__GFP_ZERO);
> > +	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
> >   	if (!*data)
> >   		return -ENOMEM;
> >
> > @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct
> > amdgpu_device *adev)
> >
> >   	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-
> >eeprom_control);
> >   	if (ret)
> > -		return ret;
> > +		goto cancel_work;
> 
> 
> Why you need do go to cancel_work_sync(&con->recovery_work) at such
> early stage of device init, is RAS active already by this time and RAS interrupt
> might fire triggering reset  ?
> 
> Andrey
> 
[Tao] Good point, I'll remove cancel_work_sync.

> 
> >
> >   	if (adev->psp.ras.ras->eeprom_control.num_recs) {
> >   		ret = amdgpu_ras_load_bad_pages(adev);
> >   		if (ret)
> > -			return ret;
> > +			goto cancel_work;
> >   		ret = amdgpu_ras_reserve_bad_pages(adev);
> >   		if (ret)
> > -			return ret;
> > +			goto release;
> >   	}
> >
> >   	return 0;
> > +
> > +release:
> > +	amdgpu_ras_release_bad_pages(adev);
> > +cancel_work:
> > +	cancel_work_sync(&con->recovery_work);
> > +	con->eh_data = NULL;
> > +	kfree((*data)->bps);
> > +	kfree((*data)->bps_bo);
> > +	kfree(*data);
> > +
> > +	return ret;
> >   }
> >
> >   static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@
> > -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
> >   	mutex_lock(&con->recovery_lock);
> >   	con->eh_data = NULL;
> >   	kfree(data->bps);
> > +	kfree(data->bps_bo);
> >   	kfree(data);
> >   	mutex_unlock(&con->recovery_lock);
> >
> > @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
> >   			return r;
> >   	}
> >
> > -	if (amdgpu_ras_recovery_init(adev))
> > -		goto recovery_out;
> > -
> >   	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
> >
> >   	if (amdgpu_ras_fs_init(adev))
> > @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
> >   			con->hw_supported, con->supported);
> >   	return 0;
> >   fs_out:
> > -	amdgpu_ras_recovery_fini(adev);
> > -recovery_out:
> >   	amdgpu_ras_set_context(adev, NULL);
> >   	kfree(con);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index 42e1d379e21c..6d00f79b612b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct
> amdgpu_device *adev,
> >   	return ras && (ras->supported & (1 << block));
> >   }
> >
> > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
> >   int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
> >   		unsigned int block);
> >
> > @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct
> amdgpu_device *adev,
> >   {
> >   	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> >
> > +	/* save bad page to eeprom before gpu reset,
> > +	 * i2c may be unstable in gpu reset
> > +	 */
> > +	amdgpu_ras_reserve_bad_pages(adev);
> >   	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> >   		schedule_work(&ras->recovery_work);
> >   	return 0;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
       [not found]         ` <SN6PR12MB2813A05D3E8BCC723AE50308F1BE0-kxOKjb6HO/Hw8A9fYknAbAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-09-02  3:00           ` Zhou1, Tao
  0 siblings, 0 replies; 12+ messages in thread
From: Zhou1, Tao @ 2019-09-02  3:00 UTC (permalink / raw)
  To: Chen, Guchun, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Grodzovsky, Andrey, Li, Dennis, Zhang, Hawking



> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen@amd.com>
> Sent: 2019年9月2日 10:11
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org;
> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Li, Dennis
> <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
> 
> 
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Friday, August 30, 2019 8:25 PM
> To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
> <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>;
> Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
> 
> support eeprom records load and save for ras, move EEPROM records
> storing to bad page reserving
> 
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++--
> ----
>  1 file changed, 83 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 24663ec41248..02120aa3cb5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
>  	return ret;
>  }
> 
> +/*
> + * write error record array to eeprom, the function should be
> + * protected by recovery_lock
> + */
> +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) {
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct ras_err_handler_data *data;
> +	struct amdgpu_ras_eeprom_control *control =
> +					&adev->psp.ras.ras->eeprom_control;
> +	int save_count;
> +
> +	if (!con || !con->eh_data)
> +		return 0;
> +
> +	data = con->eh_data;
> +	if (!data)
> +		return 0;
> [Guchun]Such check (!data) is redundant and not needed. As we have
> checked !con->eh_data earlier, and the whole function is protected by
> recovery_lock.

[Tao] OK, I'll remove it.

> 
> +	save_count = data->count - control->num_recs;
> +	/* only new entries are saved */
> +	if (save_count > 0)
> +		if (amdgpu_ras_eeprom_process_recods(&con-
> >eeprom_control,
> +							&data->bps[control-
> >num_recs],
> +							true,
> +							save_count)) {
> +			DRM_ERROR("Failed to save EEPROM table data!");
> +			return -EIO;
> +		}
> +
> +	return 0;
> +}
> +
> +/*
> + * read error record array in eeprom and reserve enough space for
> + * storing new bad pages
> + */
> +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) {
> +	struct amdgpu_ras_eeprom_control *control =
> +					&adev->psp.ras.ras->eeprom_control;
> +	struct eeprom_table_record *bps = NULL;
> +	int ret = 0;
> +
> +	/* no bad page record, skip eeprom access */
> +	if (!control->num_recs)
> +		return ret;
> +
> +	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
> +	if (!bps)
> +		return -ENOMEM;
> +
> +	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
> +		control->num_recs)) {
> +		DRM_ERROR("Failed to load EEPROM table records!");
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
> +
> +out:
> +	kfree(bps);
> +	return ret;
> +}
> +
>  /* called in gpu recovery/init */
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)  { @@ -
> 1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>  	struct ras_err_handler_data *data;
>  	uint64_t bp;
>  	struct amdgpu_bo *bo;
> -	int i;
> +	int i, ret = 0;
> 
>  	if (!con || !con->eh_data)
>  		return 0;
> @@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>  		data->bps_bo[i] = bo;
>  		data->last_reserved = i + 1;
>  	}
> +
> +	ret = amdgpu_ras_save_bad_pages(adev);
>  out:
>  	mutex_unlock(&con->recovery_lock);
> -	return 0;
> +	return ret;
>  }
> 
>  /* called when driver unload */
> @@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct
> amdgpu_device *adev)
>  	return 0;
>  }
> 
> -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{
> -	/* TODO
> -	 * write the array to eeprom when SMU disabled.
> -	 */
> -	return 0;
> -}
> -
> -/*
> - * read error record array in eeprom and reserve enough space for
> - * storing new bad pages
> - */
> -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{
> -	struct eeprom_table_record *bps = NULL;
> -	int ret;
> -
> -	ret = amdgpu_ras_add_bad_pages(adev, bps,
> -				adev->umc.max_ras_err_cnt_per_query);
> -
> -	return ret;
> -}
> -
>  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  	struct ras_err_handler_data **data = &con->eh_data;
> +	int ret;
> 
>  	*data = kmalloc(sizeof(**data),
>  			GFP_KERNEL|__GFP_ZERO);
> @@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
>  	atomic_set(&con->in_recovery, 0);
>  	con->adev = adev;
> 
> -	amdgpu_ras_load_bad_pages(adev);
> -	amdgpu_ras_reserve_bad_pages(adev);
> +	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-
> >eeprom_control);
> +	if (ret)
> +		return ret;
> +
> +	if (adev->psp.ras.ras->eeprom_control.num_recs) {
> +		ret = amdgpu_ras_load_bad_pages(adev);
> +		if (ret)
> +			return ret;
> +		ret = amdgpu_ras_reserve_bad_pages(adev);
> +		if (ret)
> +			return ret;
> +	}
> 
>  	return 0;
>  }
> @@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
>  	struct ras_err_handler_data *data = con->eh_data;
> 
>  	cancel_work_sync(&con->recovery_work);
> -	amdgpu_ras_save_bad_pages(adev);
>  	amdgpu_ras_release_bad_pages(adev);
> 
>  	mutex_lock(&con->recovery_lock);
> --
> 2.17.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure
       [not found]         ` <SN6PR12MB2813C145D6004891FAF398B5F1BE0-kxOKjb6HO/Hw8A9fYknAbAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-09-02  3:14           ` Zhou1, Tao
  0 siblings, 0 replies; 12+ messages in thread
From: Zhou1, Tao @ 2019-09-02  3:14 UTC (permalink / raw)
  To: Chen, Guchun, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Grodzovsky, Andrey, Li, Dennis, Zhang, Hawking



> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen@amd.com>
> Sent: 2019年9月2日 10:13
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org;
> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Li, Dennis
> <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table
> record structure
> 
> 
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Friday, August 30, 2019 8:25 PM
> To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
> <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>;
> Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
> Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table
> record structure
> 
> change bps type from retired page to eeprom table record, prepare for
> saving error records to eeprom
> 
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++-------
> --  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
>  2 files changed, 43 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2ca3997d4b3a..24663ec41248 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct
> amdgpu_device *adev,
> 
>  	for (; i < data->count; i++) {
>  		(*bps)[i] = (struct ras_badpage){
> -			.bp = data->bps[i].bp,
> +			.bp = data->bps[i].retired_page,
>  			.size = AMDGPU_GPU_PAGE_SIZE,
>  			.flags = 0,
>  		};
> 
>  		if (data->last_reserved <= i)
>  			(*bps)[i].flags = 1;
> -		else if (data->bps[i].bo == NULL)
> +		else if (data->bps_bo[i] == NULL)
>  			(*bps)[i].flags = 2;
>  	}
> 
> @@ -1288,30 +1288,40 @@ static int
> amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,  {
>  	unsigned int old_space = data->count + data->space_left;
>  	unsigned int new_space = old_space + pages;
> -	unsigned int align_space = ALIGN(new_space, 1024);
> -	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
> -
> -	if (!tmp)
> +	unsigned int align_space = ALIGN(new_space, 512);
> [Guchun]Any special reason to change alignment from 512 to 1024?

[Tao] The old "data->bps" is 16 byte and new " struct eeprom_table_record bps" is 31 bytes on 64bit system, I'd like to lower the pressure on memory system. The value can be adjusted according to feedback in the future.

> 
> +	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
> +	struct amdgpu_bo **bps_bo =
> +			kmalloc(align_space * sizeof(*data->bps_bo),
> GFP_KERNEL);
> +
> +	if (!bps || !bps_bo) {
> +		kfree(bps);
> +		kfree(bps_bo);
>  		return -ENOMEM;
> +	}
> 
>  	if (data->bps) {
> -		memcpy(tmp, data->bps,
> +		memcpy(bps, data->bps,
>  				data->count * sizeof(*data->bps));
>  		kfree(data->bps);
>  	}
> +	if (data->bps_bo) {
> +		memcpy(bps_bo, data->bps_bo,
> +				data->count * sizeof(*data->bps_bo));
> +		kfree(data->bps_bo);
> +	}
> 
> -	data->bps = tmp;
> +	data->bps = bps;
> +	data->bps_bo = bps_bo;
>  	data->space_left += align_space - old_space;
>  	return 0;
>  }
> 
>  /* it deal with vram only. */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> -		unsigned long *bps, int pages)
> +		struct eeprom_table_record *bps, int pages)
>  {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  	struct ras_err_handler_data *data;
> -	int i = pages;
>  	int ret = 0;
> 
>  	if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10
> +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>  			goto out;
>  		}
> 
> -	while (i--)
> -		data->bps[data->count++].bp = bps[i];
> -
> +	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
> +	data->count += pages;
>  	data->space_left -= pages;
> +
>  out:
>  	mutex_unlock(&con->recovery_lock);
> 
> @@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>  		goto out;
>  	/* reserve vram at driver post stage. */
>  	for (i = data->last_reserved; i < data->count; i++) {
> -		bp = data->bps[i].bp;
> +		bp = data->bps[i].retired_page;
> 
>  		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
>  					PAGE_SIZE, &bo))
>  			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n",
> bp);
> 
> -		data->bps[i].bo = bo;
> +		data->bps_bo[i] = bo;
>  		data->last_reserved = i + 1;
>  	}
>  out:
> @@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct
> amdgpu_device *adev)
>  		goto out;
> 
>  	for (i = data->last_reserved - 1; i >= 0; i--) {
> -		bo = data->bps[i].bo;
> +		bo = data->bps_bo[i];
> 
>  		amdgpu_ras_release_vram(adev, &bo);
> 
> -		data->bps[i].bo = bo;
> +		data->bps_bo[i] = bo;
>  		data->last_reserved = i;
>  	}
>  out:
> @@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct
> amdgpu_device *adev)
>  	return 0;
>  }
> 
> +/*
> + * read error record array in eeprom and reserve enough space for
> + * storing new bad pages
> + */
>  static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)  {
> -	/* TODO
> -	 * read the array to eeprom when SMU disabled.
> -	 */
> -	return 0;
> +	struct eeprom_table_record *bps = NULL;
> +	int ret;
> +
> +	ret = amdgpu_ras_add_bad_pages(adev, bps,
> +				adev->umc.max_ras_err_cnt_per_query);
> +
> +	return ret;
>  }
> 
>  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 66b71525446e..b6bac873c588 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -351,11 +351,10 @@ struct ras_err_data {  };
> 
>  struct ras_err_handler_data {
> -	/* point to bad pages array */
> -	struct {
> -		unsigned long bp;
> -		struct amdgpu_bo *bo;
> -	} *bps;
> +	/* point to bad page records array */
> +	struct eeprom_table_record *bps;
> +	/* point to reserved bo array */
> +	struct amdgpu_bo **bps_bo;
>  	/* the count of entries */
>  	int count;
>  	/* the space can place new entries */
> @@ -492,7 +491,7 @@ unsigned long
> amdgpu_ras_query_error_count(struct amdgpu_device *adev,
> 
>  /* error handling functions */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> -		unsigned long *bps, int pages);
> +		struct eeprom_table_record *bps, int pages);
> 
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
> 
> --
> 2.17.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2019-09-02  3:14 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-30 12:24 [PATCH 0/4] add support for ras page retirement Tao Zhou
     [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-08-30 12:24   ` [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure Tao Zhou
     [not found]     ` <20190830122453.19703-2-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-09-02  2:13       ` Chen, Guchun
     [not found]         ` <SN6PR12MB2813C145D6004891FAF398B5F1BE0-kxOKjb6HO/Hw8A9fYknAbAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-09-02  3:14           ` Zhou1, Tao
2019-08-30 12:24   ` [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS Tao Zhou
     [not found]     ` <20190830122453.19703-3-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-09-02  2:11       ` Chen, Guchun
     [not found]         ` <SN6PR12MB2813A05D3E8BCC723AE50308F1BE0-kxOKjb6HO/Hw8A9fYknAbAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-09-02  3:00           ` Zhou1, Tao
2019-08-30 12:24   ` [PATCH 3/4] drm/amdgpu: save umc error records Tao Zhou
2019-08-30 12:24   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Tao Zhou
     [not found]     ` <20190830122453.19703-5-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-08-30 14:03       ` Grodzovsky, Andrey
     [not found]         ` <d70f5672-2d8e-8efe-7b08-9df1c87f98ba-5C7GfCeVMHo@public.gmane.org>
2019-09-02  2:58           ` Zhou1, Tao
2019-09-02  2:25   ` [PATCH 0/4] add support for ras page retirement Chen, Guchun

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.