All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure
@ 2019-09-05  4:03 Zhou1, Tao
       [not found] ` <20190905040324.18741-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Zhou1, Tao @ 2019-09-05  4:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky, Andrey,
	Chen, Guchun, Li, Dennis, Zhang, Hawking, Clements, John
  Cc: Zhou1, Tao

change bps type from retired page to eeprom table record, prepare for
saving umc error records to eeprom

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
 2 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5c2276bb8325..c6f4c01b98a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1203,14 +1203,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 
 	for (; i < data->count; i++) {
 		(*bps)[i] = (struct ras_badpage){
-			.bp = data->bps[i].bp,
+			.bp = data->bps[i].retired_page,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = 0,
 		};
 
 		if (data->last_reserved <= i)
 			(*bps)[i].flags = 1;
-		else if (data->bps[i].bo == NULL)
+		else if (data->bps_bo[i] == NULL)
 			(*bps)[i].flags = 2;
 	}
 
@@ -1304,30 +1304,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 {
 	unsigned int old_space = data->count + data->space_left;
 	unsigned int new_space = old_space + pages;
-	unsigned int align_space = ALIGN(new_space, 1024);
-	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-
-	if (!tmp)
+	unsigned int align_space = ALIGN(new_space, 512);
+	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+	struct amdgpu_bo **bps_bo =
+			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
+
+	if (!bps || !bps_bo) {
+		kfree(bps);
+		kfree(bps_bo);
 		return -ENOMEM;
+	}
 
 	if (data->bps) {
-		memcpy(tmp, data->bps,
+		memcpy(bps, data->bps,
 				data->count * sizeof(*data->bps));
 		kfree(data->bps);
 	}
+	if (data->bps_bo) {
+		memcpy(bps_bo, data->bps_bo,
+				data->count * sizeof(*data->bps_bo));
+		kfree(data->bps_bo);
+	}
 
-	data->bps = tmp;
+	data->bps = bps;
+	data->bps_bo = bps_bo;
 	data->space_left += align_space - old_space;
 	return 0;
 }
 
 /* it deal with vram only. */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages)
+		struct eeprom_table_record *bps, int pages)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
-	int i = pages;
 	int ret = 0;
 
 	if (!con || !con->eh_data || !bps || pages <= 0)
@@ -1344,10 +1354,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			goto out;
 		}
 
-	while (i--)
-		data->bps[data->count++].bp = bps[i];
-
+	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
+	data->count += pages;
 	data->space_left -= pages;
+
 out:
 	mutex_unlock(&con->recovery_lock);
 
@@ -1372,13 +1382,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		goto out;
 	/* reserve vram at driver post stage. */
 	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].bp;
+		bp = data->bps[i].retired_page;
 
 		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
 					PAGE_SIZE, &bo))
 			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
 out:
@@ -1403,11 +1413,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 		goto out;
 
 	for (i = data->last_reserved - 1; i >= 0; i--) {
-		bo = data->bps[i].bo;
+		bo = data->bps_bo[i];
 
 		amdgpu_ras_release_vram(adev, &bo);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i;
 	}
 out:
@@ -1423,12 +1433,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 {
-	/* TODO
-	 * read the array to eeprom when SMU disabled.
-	 */
-	return 0;
+	struct eeprom_table_record *bps = NULL;
+	int ret;
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps,
+				adev->umc.max_ras_err_cnt_per_query);
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f487038ba331..bc1d45971607 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -351,11 +351,10 @@ struct ras_err_data {
 };
 
 struct ras_err_handler_data {
-	/* point to bad pages array */
-	struct {
-		unsigned long bp;
-		struct amdgpu_bo *bo;
-	} *bps;
+	/* point to bad page records array */
+	struct eeprom_table_record *bps;
+	/* point to reserved bo array */
+	struct amdgpu_bo **bps_bo;
 	/* the count of entries */
 	int count;
 	/* the space can place new entries */
@@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages);
+		struct eeprom_table_record *bps, int pages);
 
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
       [not found] ` <20190905040324.18741-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
@ 2019-09-05  4:03   ` Zhou1, Tao
  2019-09-05  4:03   ` [PATCH 3/4] drm/amdgpu: save umc error records Zhou1, Tao
  2019-09-05  4:04   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Zhou1, Tao
  2 siblings, 0 replies; 8+ messages in thread
From: Zhou1, Tao @ 2019-09-05  4:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky, Andrey,
	Chen, Guchun, Li, Dennis, Zhang, Hawking, Clements, John
  Cc: Zhou1, Tao

support eeprom records load and save for ras,
move EEPROM records storing to bad page reserving

v2: remove redundant check for con->eh_data

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 109 ++++++++++++++++++------
 1 file changed, 81 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c6f4c01b98a8..e68f43d1cfea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1364,6 +1364,69 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	return ret;
 }
 
+/*
+ * write error record array to eeprom, the function should be
+ * protected by recovery_lock
+ */
+static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	struct amdgpu_ras_eeprom_control *control =
+					&adev->psp.ras.ras->eeprom_control;
+	int save_count;
+
+	if (!con || !con->eh_data)
+		return 0;
+
+	data = con->eh_data;
+	save_count = data->count - control->num_recs;
+	/* only new entries are saved */
+	if (save_count > 0)
+		if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control,
+							&data->bps[control->num_recs],
+							true,
+							save_count)) {
+			DRM_ERROR("Failed to save EEPROM table data!");
+			return -EIO;
+		}
+
+	return 0;
+}
+
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
+static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_eeprom_control *control =
+					&adev->psp.ras.ras->eeprom_control;
+	struct eeprom_table_record *bps = NULL;
+	int ret = 0;
+
+	/* no bad page record, skip eeprom access */
+	if (!control->num_recs)
+		return ret;
+
+	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
+	if (!bps)
+		return -ENOMEM;
+
+	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
+		control->num_recs)) {
+		DRM_ERROR("Failed to load EEPROM table records!");
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
+
+out:
+	kfree(bps);
+	return ret;
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 {
@@ -1371,7 +1434,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 	struct ras_err_handler_data *data;
 	uint64_t bp;
 	struct amdgpu_bo *bo;
-	int i;
+	int i, ret = 0;
 
 	if (!con || !con->eh_data)
 		return 0;
@@ -1391,9 +1454,12 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
+
+	/* continue to save bad pages to eeprom even reesrve_vram fails */
+	ret = amdgpu_ras_save_bad_pages(adev);
 out:
 	mutex_unlock(&con->recovery_lock);
-	return 0;
+	return ret;
 }
 
 /* called when driver unload */
@@ -1425,33 +1491,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
-{
-	/* TODO
-	 * write the array to eeprom when SMU disabled.
-	 */
-	return 0;
-}
-
-/*
- * read error record array in eeprom and reserve enough space for
- * storing new bad pages
- */
-static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
-{
-	struct eeprom_table_record *bps = NULL;
-	int ret;
-
-	ret = amdgpu_ras_add_bad_pages(adev, bps,
-				adev->umc.max_ras_err_cnt_per_query);
-
-	return ret;
-}
-
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
+	int ret;
 
 	*data = kmalloc(sizeof(**data),
 			GFP_KERNEL|__GFP_ZERO);
@@ -1463,8 +1507,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	atomic_set(&con->in_recovery, 0);
 	con->adev = adev;
 
-	amdgpu_ras_load_bad_pages(adev);
-	amdgpu_ras_reserve_bad_pages(adev);
+	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
+	if (ret)
+		return ret;
+
+	if (adev->psp.ras.ras->eeprom_control.num_recs) {
+		ret = amdgpu_ras_load_bad_pages(adev);
+		if (ret)
+			return ret;
+		ret = amdgpu_ras_reserve_bad_pages(adev);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
@@ -1475,7 +1529,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	struct ras_err_handler_data *data = con->eh_data;
 
 	cancel_work_sync(&con->recovery_work);
-	amdgpu_ras_save_bad_pages(adev);
 	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/4] drm/amdgpu: save umc error records
       [not found] ` <20190905040324.18741-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  2019-09-05  4:03   ` [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS Zhou1, Tao
@ 2019-09-05  4:03   ` Zhou1, Tao
  2019-09-05  4:04   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Zhou1, Tao
  2 siblings, 0 replies; 8+ messages in thread
From: Zhou1, Tao @ 2019-09-05  4:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky, Andrey,
	Chen, Guchun, Li, Dennis, Zhang, Hawking, Clements, John
  Cc: Zhou1, Tao

save umc error records to ras bad page array

v2: add bad pages before gpu reset
v3: add NULL check for adev->umc.funcs

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 40 +++++++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c   | 39 +++++++++++++++++++-----
 3 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bc1d45971607..96210e18191e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -347,7 +347,7 @@ struct ras_err_data {
 	unsigned long ue_count;
 	unsigned long ce_count;
 	unsigned long err_addr_cnt;
-	uint64_t *err_addr;
+	struct eeprom_table_record *err_addr;
 };
 
 struct ras_err_handler_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 9d15679df6e0..beb6c84ab9e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -243,21 +243,43 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 		struct ras_err_data *err_data,
 		struct amdgpu_iv_entry *entry)
 {
-	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
-		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-		if (adev->umc.funcs->query_ras_error_count)
-			adev->umc.funcs->query_ras_error_count(adev, err_data);
+	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+		return AMDGPU_RAS_SUCCESS;
+
+	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+	if (adev->umc.funcs &&
+	    adev->umc.funcs->query_ras_error_count)
+	    adev->umc.funcs->query_ras_error_count(adev, err_data);
+
+	if (adev->umc.funcs &&
+	    adev->umc.funcs->query_ras_error_address &&
+	    adev->umc.max_ras_err_cnt_per_query) {
+		err_data->err_addr =
+			kcalloc(adev->umc.max_ras_err_cnt_per_query,
+				sizeof(struct eeprom_table_record), GFP_KERNEL);
+		/* still call query_ras_error_address to clear error status
+		 * even NOMEM error is encountered
+		 */
+		if(!err_data->err_addr)
+			DRM_WARN("Failed to alloc memory for umc error address record!\n");
+
 		/* umc query_ras_error_address is also responsible for clearing
 		 * error status
 		 */
-		if (adev->umc.funcs->query_ras_error_address)
-			adev->umc.funcs->query_ras_error_address(adev, err_data);
+		adev->umc.funcs->query_ras_error_address(adev, err_data);
+	}
+
+	/* only uncorrectable error needs gpu reset */
+	if (err_data->ue_count) {
+		if (err_data->err_addr_cnt &&
+		    amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+						err_data->err_addr_cnt))
+			DRM_WARN("Failed to add ras bad page!\n");
 
-		/* only uncorrectable error needs gpu reset */
-		if (err_data->ue_count)
-			amdgpu_ras_reset_gpu(adev, 0);
+		amdgpu_ras_reset_gpu(adev, 0);
 	}
 
+	kfree(err_data->err_addr);
 	return AMDGPU_RAS_SUCCESS;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
index 8502e736f721..09e316a22f1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
@@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
 			RSMU_UMC_INDEX_MODE_EN, 0);
 }
 
+static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
+{
+	uint32_t rsmu_umc_index;
+
+	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
+				mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
+	return REG_GET_FIELD(rsmu_umc_index,
+				RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
+				RSMU_UMC_INDEX_INSTANCE);
+}
+
 static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
 						   uint32_t umc_reg_offset,
 						   unsigned long *error_count)
@@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 					 uint32_t umc_reg_offset, uint32_t channel_index)
 {
 	uint32_t lsb, mc_umc_status_addr;
-	uint64_t mc_umc_status, err_addr;
+	uint64_t mc_umc_status, err_addr, retired_page;
+	struct eeprom_table_record *err_rec;
 
 	mc_umc_status_addr =
 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 		return;
 	}
 
+	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
 	mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);
 
 	/* calculate error address if ue/ce error is detected */
@@ -191,12 +204,24 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 		err_addr &= ~((0x1ULL << lsb) - 1);
 
 		/* translate umc channel address to soc pa, 3 parts are included */
-		err_data->err_addr[err_data->err_addr_cnt] =
-						ADDR_OF_8KB_BLOCK(err_addr) |
-						ADDR_OF_256B_BLOCK(channel_index) |
-						OFFSET_IN_256B_BLOCK(err_addr);
-
-		err_data->err_addr_cnt++;
+		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
+				ADDR_OF_256B_BLOCK(channel_index) |
+				OFFSET_IN_256B_BLOCK(err_addr);
+
+		/* we only save ue error information currently, ce is skipped */
+		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
+				== 1) {
+			err_rec->address = err_addr;
+			/* page frame address is saved */
+			err_rec->retired_page = retired_page >> PAGE_SHIFT;
+			err_rec->ts = (uint64_t)ktime_get_real_seconds();
+			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+			err_rec->cu = 0;
+			err_rec->mem_channel = channel_index;
+			err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev);
+
+			err_data->err_addr_cnt++;
+		}
 	}
 
 	/* clear umc status */
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
       [not found] ` <20190905040324.18741-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  2019-09-05  4:03   ` [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS Zhou1, Tao
  2019-09-05  4:03   ` [PATCH 3/4] drm/amdgpu: save umc error records Zhou1, Tao
@ 2019-09-05  4:04   ` Zhou1, Tao
       [not found]     ` <20190905040324.18741-4-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  2 siblings, 1 reply; 8+ messages in thread
From: Zhou1, Tao @ 2019-09-05  4:04 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky, Andrey,
	Chen, Guchun, Li, Dennis, Zhang, Hawking, Clements, John
  Cc: Zhou1, Tao

ras recovery_init should be called after ttm init,
bad page reserve should be put in front of gpu reset since i2c
may be unstable during gpu reset.
add cleanup for recovery_init and recovery_fini

v2: add more comment and print.
    remove cancel_work_sync in recovery_init.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 39 ++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 12 +++++++
 4 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e50861e16cf5..22cd3deab731 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3629,11 +3629,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 						break;
 				}
 			}
-
-			list_for_each_entry(tmp_adev, device_list_handle,
-					gmc.xgmi.head) {
-				amdgpu_ras_reserve_bad_pages(tmp_adev);
-			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e68f43d1cfea..c2b2b0e3515c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1491,16 +1491,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
 	int ret;
 
-	*data = kmalloc(sizeof(**data),
-			GFP_KERNEL|__GFP_ZERO);
-	if (!*data)
-		return -ENOMEM;
+	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+	if (!*data) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	mutex_init(&con->recovery_lock);
 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
@@ -1509,18 +1510,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 
 	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
 	if (ret)
-		return ret;
+		goto free;
 
 	if (adev->psp.ras.ras->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto free;
 		ret = amdgpu_ras_reserve_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto release;
 	}
 
 	return 0;
+
+release:
+	amdgpu_ras_release_bad_pages(adev);
+free:
+	con->eh_data = NULL;
+	kfree((*data)->bps);
+	kfree((*data)->bps_bo);
+	kfree(*data);
+out:
+	DRM_WARN("Failed to initilaize ras recovery!\n");
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1528,12 +1541,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data = con->eh_data;
 
+	/* recovery_init failed to init it, fini is useless */
+	if (!data)
+		return 0;
+
 	cancel_work_sync(&con->recovery_work);
 	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
+	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
@@ -1625,9 +1643,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (amdgpu_ras_recovery_init(adev))
-		goto recovery_out;
-
 	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
 	if (amdgpu_ras_fs_init(adev))
@@ -1642,8 +1657,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			con->hw_supported, con->supported);
 	return 0;
 fs_out:
-	amdgpu_ras_recovery_fini(adev);
-recovery_out:
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 96210e18191e..012034d2ae06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 	return ras && (ras->supported & (1 << block));
 }
 
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 		unsigned int block);
 
@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+	/* save bad page to eeprom before gpu reset,
+	 * i2c may be unstable in gpu reset
+	 */
+	amdgpu_ras_reserve_bad_pages(adev);
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
 		schedule_work(&ras->recovery_work);
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dcd32d01a579..c05638cf3f3d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -49,6 +49,7 @@
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_sdma.h"
+#include "amdgpu_ras.h"
 #include "bif/bif_4_1_d.h"
 
 static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
@@ -1772,6 +1773,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 						adev->gmc.visible_vram_size);
 #endif
 
+	/*
+	 * retired pages will be loaded from eeprom and reserved here,
+	 * it should be called after ttm init since new bo may be created,
+	 * recovery_init may fail, but it can free all resources allocated by
+	 * itself and its failure should not stop amdgpu init process.
+	 *
+	 * Note: theoretically, this should be called before all vram allocations
+	 * to protect retired page from abusing
+	 */
+	amdgpu_ras_recovery_init(adev);
+
 	/*
 	 *The reserved vram for firmware must be pinned to the specified
 	 *place on the VRAM, so reserve it early.
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
       [not found]     ` <20190905040324.18741-4-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
@ 2019-09-05  7:56       ` Chen, Guchun
  0 siblings, 0 replies; 8+ messages in thread
From: Chen, Guchun @ 2019-09-05  7:56 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky,
	Andrey, Li, Dennis, Zhang, Hawking, Clements, John

Except one spelling typo, series is: Reviewed-by: Guchun Chen <guchun.chen@amd.com>

Regards,
Guchun

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com> 
Sent: Thursday, September 5, 2019 12:04 PM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Clements, John <John.Clements@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place

ras recovery_init should be called after ttm init, bad page reserve should be put in front of gpu reset since i2c may be unstable during gpu reset.
add cleanup for recovery_init and recovery_fini

v2: add more comment and print.
    remove cancel_work_sync in recovery_init.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 39 ++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 12 +++++++
 4 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e50861e16cf5..22cd3deab731 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3629,11 +3629,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 						break;
 				}
 			}
-
-			list_for_each_entry(tmp_adev, device_list_handle,
-					gmc.xgmi.head) {
-				amdgpu_ras_reserve_bad_pages(tmp_adev);
-			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e68f43d1cfea..c2b2b0e3515c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1491,16 +1491,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
 	int ret;
 
-	*data = kmalloc(sizeof(**data),
-			GFP_KERNEL|__GFP_ZERO);
-	if (!*data)
-		return -ENOMEM;
+	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+	if (!*data) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	mutex_init(&con->recovery_lock);
 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); @@ -1509,18 +1510,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 
 	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
 	if (ret)
-		return ret;
+		goto free;
 
 	if (adev->psp.ras.ras->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto free;
 		ret = amdgpu_ras_reserve_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto release;
 	}
 
 	return 0;
+
+release:
+	amdgpu_ras_release_bad_pages(adev);
+free:
+	con->eh_data = NULL;
+	kfree((*data)->bps);
+	kfree((*data)->bps_bo);
+	kfree(*data);
+out:
+	DRM_WARN("Failed to initilaize ras recovery!\n");
[Guchun] One spelling typo of initialize.
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@ -1528,12 +1541,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data = con->eh_data;
 
+	/* recovery_init failed to init it, fini is useless */
+	if (!data)
+		return 0;
+
 	cancel_work_sync(&con->recovery_work);
 	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
+	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
@@ -1625,9 +1643,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (amdgpu_ras_recovery_init(adev))
-		goto recovery_out;
-
 	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
 	if (amdgpu_ras_fs_init(adev))
@@ -1642,8 +1657,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			con->hw_supported, con->supported);
 	return 0;
 fs_out:
-	amdgpu_ras_recovery_fini(adev);
-recovery_out:
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 96210e18191e..012034d2ae06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 	return ras && (ras->supported & (1 << block));  }
 
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 		unsigned int block);
 
@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,  {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+	/* save bad page to eeprom before gpu reset,
+	 * i2c may be unstable in gpu reset
+	 */
+	amdgpu_ras_reserve_bad_pages(adev);
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
 		schedule_work(&ras->recovery_work);
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dcd32d01a579..c05638cf3f3d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -49,6 +49,7 @@
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_sdma.h"
+#include "amdgpu_ras.h"
 #include "bif/bif_4_1_d.h"
 
 static int amdgpu_map_buffer(struct ttm_buffer_object *bo, @@ -1772,6 +1773,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 						adev->gmc.visible_vram_size);
 #endif
 
+	/*
+	 * retired pages will be loaded from eeprom and reserved here,
+	 * it should be called after ttm init since new bo may be created,
+	 * recovery_init may fail, but it can free all resources allocated by
+	 * itself and its failure should not stop amdgpu init process.
+	 *
+	 * Note: theoretically, this should be called before all vram allocations
+	 * to protect retired page from abusing
+	 */
+	amdgpu_ras_recovery_init(adev);
+
 	/*
 	 *The reserved vram for firmware must be pinned to the specified
 	 *place on the VRAM, so reserve it early.
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
       [not found]         ` <d70f5672-2d8e-8efe-7b08-9df1c87f98ba-5C7GfCeVMHo@public.gmane.org>
@ 2019-09-02  2:58           ` Zhou1, Tao
  0 siblings, 0 replies; 8+ messages in thread
From: Zhou1, Tao @ 2019-09-02  2:58 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Chen, Guchun, Li, Dennis, Zhang, Hawking



> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: 2019年8月30日 22:03
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org;
> Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>;
> Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: Re: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and
> bad page reserve to proper place
> 
> 
> On 8/30/19 8:24 AM, Tao Zhou wrote:
> > ras recovery_init should be called after ttm and smu init, bad page
> > reserve should be put in front of gpu reset since i2c may be unstable
> > during gpu reset add cleanup for recovery_init and recovery_fini
> >
> > Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++-------
> --
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
> >   3 files changed, 33 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 67b09cb2a9e2..4e4895ac926d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
> >   		goto failed;
> >   	}
> >
> > +	/* retired pages will be loaded from eeprom and reserved here,
> > +	 * new bo may be created, it should be called after ttm init,
> > +	 * accessing eeprom also relies on smu (unlock i2c bus) and it
> > +	 * should be called after smu init
> > +	 *
> > +	 * Note: theoretically, this should be called before all vram allocations
> > +	 * to protect retired page from abusing, but there are some
> allocations
> > +	 * in front of smu fw loading
> > +	 */
> > +	amdgpu_ras_recovery_init(adev);
> 
> 
> You probably should check for return value here.
[Tao] No check here is intentional, the failure of recovery_init should not stop amdgpu init process and recovery_init could free resources allocated by itself if it fails. I'll add comment here and add a DRM_WARN for recovery_init.

> 
> 
> > +
> >   	/* must succeed. */
> >   	amdgpu_ras_resume(adev);
> >
> > @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct
> amdgpu_hive_info *hive,
> >   						break;
> >   				}
> >   			}
> > -
> > -			list_for_each_entry(tmp_adev, device_list_handle,
> > -					gmc.xgmi.head) {
> > -				amdgpu_ras_reserve_bad_pages(tmp_adev);
> > -			}
> >   		}
> >   	}
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 02120aa3cb5d..ad67a109122f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -1477,14 +1477,13 @@ static int
> amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
> >   	return 0;
> >   }
> >
> > -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> >   {
> >   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> >   	struct ras_err_handler_data **data = &con->eh_data;
> >   	int ret;
> >
> > -	*data = kmalloc(sizeof(**data),
> > -			GFP_KERNEL|__GFP_ZERO);
> > +	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
> >   	if (!*data)
> >   		return -ENOMEM;
> >
> > @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct
> > amdgpu_device *adev)
> >
> >   	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-
> >eeprom_control);
> >   	if (ret)
> > -		return ret;
> > +		goto cancel_work;
> 
> 
> Why you need do go to cancel_work_sync(&con->recovery_work) at such
> early stage of device init, is RAS active already by this time and RAS interrupt
> might fire triggering reset  ?
> 
> Andrey
> 
[Tao] Good point, I'll remove cancel_work_sync.

> 
> >
> >   	if (adev->psp.ras.ras->eeprom_control.num_recs) {
> >   		ret = amdgpu_ras_load_bad_pages(adev);
> >   		if (ret)
> > -			return ret;
> > +			goto cancel_work;
> >   		ret = amdgpu_ras_reserve_bad_pages(adev);
> >   		if (ret)
> > -			return ret;
> > +			goto release;
> >   	}
> >
> >   	return 0;
> > +
> > +release:
> > +	amdgpu_ras_release_bad_pages(adev);
> > +cancel_work:
> > +	cancel_work_sync(&con->recovery_work);
> > +	con->eh_data = NULL;
> > +	kfree((*data)->bps);
> > +	kfree((*data)->bps_bo);
> > +	kfree(*data);
> > +
> > +	return ret;
> >   }
> >
> >   static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@
> > -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
> >   	mutex_lock(&con->recovery_lock);
> >   	con->eh_data = NULL;
> >   	kfree(data->bps);
> > +	kfree(data->bps_bo);
> >   	kfree(data);
> >   	mutex_unlock(&con->recovery_lock);
> >
> > @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
> >   			return r;
> >   	}
> >
> > -	if (amdgpu_ras_recovery_init(adev))
> > -		goto recovery_out;
> > -
> >   	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
> >
> >   	if (amdgpu_ras_fs_init(adev))
> > @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
> >   			con->hw_supported, con->supported);
> >   	return 0;
> >   fs_out:
> > -	amdgpu_ras_recovery_fini(adev);
> > -recovery_out:
> >   	amdgpu_ras_set_context(adev, NULL);
> >   	kfree(con);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index 42e1d379e21c..6d00f79b612b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct
> amdgpu_device *adev,
> >   	return ras && (ras->supported & (1 << block));
> >   }
> >
> > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
> >   int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
> >   		unsigned int block);
> >
> > @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct
> amdgpu_device *adev,
> >   {
> >   	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> >
> > +	/* save bad page to eeprom before gpu reset,
> > +	 * i2c may be unstable in gpu reset
> > +	 */
> > +	amdgpu_ras_reserve_bad_pages(adev);
> >   	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> >   		schedule_work(&ras->recovery_work);
> >   	return 0;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
       [not found]     ` <20190830122453.19703-5-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-30 14:03       ` Grodzovsky, Andrey
       [not found]         ` <d70f5672-2d8e-8efe-7b08-9df1c87f98ba-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Grodzovsky, Andrey @ 2019-08-30 14:03 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Chen,
	Guchun, Li, Dennis, Zhang, Hawking


On 8/30/19 8:24 AM, Tao Zhou wrote:
> ras recovery_init should be called after ttm and smu init,
> bad page reserve should be put in front of gpu reset since i2c
> may be unstable during gpu reset
> add cleanup for recovery_init and recovery_fini
>
> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
>   3 files changed, 33 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 67b09cb2a9e2..4e4895ac926d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   		goto failed;
>   	}
>   
> +	/* retired pages will be loaded from eeprom and reserved here,
> +	 * new bo may be created, it should be called after ttm init,
> +	 * accessing eeprom also relies on smu (unlock i2c bus) and it
> +	 * should be called after smu init
> +	 *
> +	 * Note: theoretically, this should be called before all vram allocations
> +	 * to protect retired page from abusing, but there are some allocations
> +	 * in front of smu fw loading
> +	 */
> +	amdgpu_ras_recovery_init(adev);


You probably should check for return value here.


> +
>   	/* must succeed. */
>   	amdgpu_ras_resume(adev);
>   
> @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   						break;
>   				}
>   			}
> -
> -			list_for_each_entry(tmp_adev, device_list_handle,
> -					gmc.xgmi.head) {
> -				amdgpu_ras_reserve_bad_pages(tmp_adev);
> -			}
>   		}
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 02120aa3cb5d..ad67a109122f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1477,14 +1477,13 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   	struct ras_err_handler_data **data = &con->eh_data;
>   	int ret;
>   
> -	*data = kmalloc(sizeof(**data),
> -			GFP_KERNEL|__GFP_ZERO);
> +	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
>   	if (!*data)
>   		return -ENOMEM;
>   
> @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   
>   	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
>   	if (ret)
> -		return ret;
> +		goto cancel_work;


Why you need do go to cancel_work_sync(&con->recovery_work) at such 
early stage of device init, is RAS active already by this time and RAS 
interrupt might fire triggering reset  ?

Andrey


>   
>   	if (adev->psp.ras.ras->eeprom_control.num_recs) {
>   		ret = amdgpu_ras_load_bad_pages(adev);
>   		if (ret)
> -			return ret;
> +			goto cancel_work;
>   		ret = amdgpu_ras_reserve_bad_pages(adev);
>   		if (ret)
> -			return ret;
> +			goto release;
>   	}
>   
>   	return 0;
> +
> +release:
> +	amdgpu_ras_release_bad_pages(adev);
> +cancel_work:
> +	cancel_work_sync(&con->recovery_work);
> +	con->eh_data = NULL;
> +	kfree((*data)->bps);
> +	kfree((*data)->bps_bo);
> +	kfree(*data);
> +
> +	return ret;
>   }
>   
>   static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
> @@ -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
>   	mutex_lock(&con->recovery_lock);
>   	con->eh_data = NULL;
>   	kfree(data->bps);
> +	kfree(data->bps_bo);
>   	kfree(data);
>   	mutex_unlock(&con->recovery_lock);
>   
> @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   			return r;
>   	}
>   
> -	if (amdgpu_ras_recovery_init(adev))
> -		goto recovery_out;
> -
>   	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
>   
>   	if (amdgpu_ras_fs_init(adev))
> @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   			con->hw_supported, con->supported);
>   	return 0;
>   fs_out:
> -	amdgpu_ras_recovery_fini(adev);
> -recovery_out:
>   	amdgpu_ras_set_context(adev, NULL);
>   	kfree(con);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 42e1d379e21c..6d00f79b612b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
>   	return ras && (ras->supported & (1 << block));
>   }
>   
> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
>   int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
>   		unsigned int block);
>   
> @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>   
> +	/* save bad page to eeprom before gpu reset,
> +	 * i2c may be unstable in gpu reset
> +	 */
> +	amdgpu_ras_reserve_bad_pages(adev);
>   	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
>   		schedule_work(&ras->recovery_work);
>   	return 0;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
       [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
@ 2019-08-30 12:24   ` Tao Zhou
       [not found]     ` <20190830122453.19703-5-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Tao Zhou @ 2019-08-30 12:24 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	andrey.grodzovsky-5C7GfCeVMHo, guchun.chen-5C7GfCeVMHo,
	dennis.li-5C7GfCeVMHo, hawking.zhang-5C7GfCeVMHo
  Cc: Tao Zhou

ras recovery_init should be called after ttm and smu init,
bad page reserve should be put in front of gpu reset since i2c
may be unstable during gpu reset
add cleanup for recovery_init and recovery_fini

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
 3 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 67b09cb2a9e2..4e4895ac926d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		goto failed;
 	}
 
+	/* retired pages will be loaded from eeprom and reserved here,
+	 * new bo may be created, it should be called after ttm init,
+	 * accessing eeprom also relies on smu (unlock i2c bus) and it
+	 * should be called after smu init
+	 *
+	 * Note: theoretically, this should be called before all vram allocations
+	 * to protect retired page from abusing, but there are some allocations
+	 * in front of smu fw loading
+	 */
+	amdgpu_ras_recovery_init(adev);
+
 	/* must succeed. */
 	amdgpu_ras_resume(adev);
 
@@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 						break;
 				}
 			}
-
-			list_for_each_entry(tmp_adev, device_list_handle,
-					gmc.xgmi.head) {
-				amdgpu_ras_reserve_bad_pages(tmp_adev);
-			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 02120aa3cb5d..ad67a109122f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1477,14 +1477,13 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
 	int ret;
 
-	*data = kmalloc(sizeof(**data),
-			GFP_KERNEL|__GFP_ZERO);
+	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
 	if (!*data)
 		return -ENOMEM;
 
@@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 
 	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
 	if (ret)
-		return ret;
+		goto cancel_work;
 
 	if (adev->psp.ras.ras->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto cancel_work;
 		ret = amdgpu_ras_reserve_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto release;
 	}
 
 	return 0;
+
+release:
+	amdgpu_ras_release_bad_pages(adev);
+cancel_work:
+	cancel_work_sync(&con->recovery_work);
+	con->eh_data = NULL;
+	kfree((*data)->bps);
+	kfree((*data)->bps_bo);
+	kfree(*data);
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
+	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
@@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (amdgpu_ras_recovery_init(adev))
-		goto recovery_out;
-
 	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
 	if (amdgpu_ras_fs_init(adev))
@@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			con->hw_supported, con->supported);
 	return 0;
 fs_out:
-	amdgpu_ras_recovery_fini(adev);
-recovery_out:
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 42e1d379e21c..6d00f79b612b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 	return ras && (ras->supported & (1 << block));
 }
 
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 		unsigned int block);
 
@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+	/* save bad page to eeprom before gpu reset,
+	 * i2c may be unstable in gpu reset
+	 */
+	amdgpu_ras_reserve_bad_pages(adev);
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
 		schedule_work(&ras->recovery_work);
 	return 0;
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-09-05  7:56 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-05  4:03 [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure Zhou1, Tao
     [not found] ` <20190905040324.18741-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-09-05  4:03   ` [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS Zhou1, Tao
2019-09-05  4:03   ` [PATCH 3/4] drm/amdgpu: save umc error records Zhou1, Tao
2019-09-05  4:04   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Zhou1, Tao
     [not found]     ` <20190905040324.18741-4-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-09-05  7:56       ` Chen, Guchun
  -- strict thread matches above, loose matches on Subject: below --
2019-08-30 12:24 [PATCH 0/4] add support for ras page retirement Tao Zhou
     [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-08-30 12:24   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Tao Zhou
     [not found]     ` <20190830122453.19703-5-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-08-30 14:03       ` Grodzovsky, Andrey
     [not found]         ` <d70f5672-2d8e-8efe-7b08-9df1c87f98ba-5C7GfCeVMHo@public.gmane.org>
2019-09-02  2:58           ` Zhou1, Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.