All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] add support for ras page retirement
@ 2019-08-30 12:24 Tao Zhou
       [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Tao Zhou @ 2019-08-30 12:24 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	andrey.grodzovsky-5C7GfCeVMHo, guchun.chen-5C7GfCeVMHo,
	dennis.li-5C7GfCeVMHo, hawking.zhang-5C7GfCeVMHo
  Cc: Tao Zhou

This series saves umc error page info into a record structure and stores
the records to eeprom. It also loads error records from eeprom and
reserves the related retired pages during gpu init.


Tao Zhou (4):
  drm/amdgpu: change ras bps type to eeprom table record structure
  drm/amdgpu: Hook EEPROM table to RAS
  drm/amdgpu: save umc error records
  drm/amdgpu: move the call of ras recovery_init and bad page reserve to
    proper place

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  16 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 170 +++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  18 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  29 +++-
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c      |  39 ++++-
 5 files changed, 202 insertions(+), 70 deletions(-)

-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread
* [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure
@ 2019-09-05  4:03 Zhou1, Tao
       [not found] ` <20190905040324.18741-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Zhou1, Tao @ 2019-09-05  4:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Grodzovsky, Andrey,
	Chen, Guchun, Li, Dennis, Zhang, Hawking, Clements, John
  Cc: Zhou1, Tao

Change the bps type from retired page to eeprom table record, in
preparation for saving umc error records to eeprom.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
 2 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5c2276bb8325..c6f4c01b98a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1203,14 +1203,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 
 	for (; i < data->count; i++) {
 		(*bps)[i] = (struct ras_badpage){
-			.bp = data->bps[i].bp,
+			.bp = data->bps[i].retired_page,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = 0,
 		};
 
 		if (data->last_reserved <= i)
 			(*bps)[i].flags = 1;
-		else if (data->bps[i].bo == NULL)
+		else if (data->bps_bo[i] == NULL)
 			(*bps)[i].flags = 2;
 	}
 
@@ -1304,30 +1304,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 {
 	unsigned int old_space = data->count + data->space_left;
 	unsigned int new_space = old_space + pages;
-	unsigned int align_space = ALIGN(new_space, 1024);
-	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-
-	if (!tmp)
+	unsigned int align_space = ALIGN(new_space, 512);
+	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+	struct amdgpu_bo **bps_bo =
+			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
+
+	if (!bps || !bps_bo) {
+		kfree(bps);
+		kfree(bps_bo);
 		return -ENOMEM;
+	}
 
 	if (data->bps) {
-		memcpy(tmp, data->bps,
+		memcpy(bps, data->bps,
 				data->count * sizeof(*data->bps));
 		kfree(data->bps);
 	}
+	if (data->bps_bo) {
+		memcpy(bps_bo, data->bps_bo,
+				data->count * sizeof(*data->bps_bo));
+		kfree(data->bps_bo);
+	}
 
-	data->bps = tmp;
+	data->bps = bps;
+	data->bps_bo = bps_bo;
 	data->space_left += align_space - old_space;
 	return 0;
 }
 
 /* it deal with vram only. */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages)
+		struct eeprom_table_record *bps, int pages)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
-	int i = pages;
 	int ret = 0;
 
 	if (!con || !con->eh_data || !bps || pages <= 0)
@@ -1344,10 +1354,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			goto out;
 		}
 
-	while (i--)
-		data->bps[data->count++].bp = bps[i];
-
+	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
+	data->count += pages;
 	data->space_left -= pages;
+
 out:
 	mutex_unlock(&con->recovery_lock);
 
@@ -1372,13 +1382,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		goto out;
 	/* reserve vram at driver post stage. */
 	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].bp;
+		bp = data->bps[i].retired_page;
 
 		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
 					PAGE_SIZE, &bo))
 			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
 out:
@@ -1403,11 +1413,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 		goto out;
 
 	for (i = data->last_reserved - 1; i >= 0; i--) {
-		bo = data->bps[i].bo;
+		bo = data->bps_bo[i];
 
 		amdgpu_ras_release_vram(adev, &bo);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i;
 	}
 out:
@@ -1423,12 +1433,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 {
-	/* TODO
-	 * read the array to eeprom when SMU disabled.
-	 */
-	return 0;
+	struct eeprom_table_record *bps = NULL;
+	int ret;
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps,
+				adev->umc.max_ras_err_cnt_per_query);
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f487038ba331..bc1d45971607 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -351,11 +351,10 @@ struct ras_err_data {
 };
 
 struct ras_err_handler_data {
-	/* point to bad pages array */
-	struct {
-		unsigned long bp;
-		struct amdgpu_bo *bo;
-	} *bps;
+	/* point to bad page records array */
+	struct eeprom_table_record *bps;
+	/* point to reserved bo array */
+	struct amdgpu_bo **bps_bo;
 	/* the count of entries */
 	int count;
 	/* the space can place new entries */
@@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages);
+		struct eeprom_table_record *bps, int pages);
 
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2019-09-05  7:56 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-30 12:24 [PATCH 0/4] add support for ras page retirement Tao Zhou
     [not found] ` <20190830122453.19703-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-08-30 12:24   ` [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure Tao Zhou
     [not found]     ` <20190830122453.19703-2-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-09-02  2:13       ` Chen, Guchun
     [not found]         ` <SN6PR12MB2813C145D6004891FAF398B5F1BE0-kxOKjb6HO/Hw8A9fYknAbAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-09-02  3:14           ` Zhou1, Tao
2019-08-30 12:24   ` [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS Tao Zhou
     [not found]     ` <20190830122453.19703-3-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-09-02  2:11       ` Chen, Guchun
     [not found]         ` <SN6PR12MB2813A05D3E8BCC723AE50308F1BE0-kxOKjb6HO/Hw8A9fYknAbAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-09-02  3:00           ` Zhou1, Tao
2019-08-30 12:24   ` [PATCH 3/4] drm/amdgpu: save umc error records Tao Zhou
2019-08-30 12:24   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Tao Zhou
     [not found]     ` <20190830122453.19703-5-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-08-30 14:03       ` Grodzovsky, Andrey
     [not found]         ` <d70f5672-2d8e-8efe-7b08-9df1c87f98ba-5C7GfCeVMHo@public.gmane.org>
2019-09-02  2:58           ` Zhou1, Tao
2019-09-02  2:25   ` [PATCH 0/4] add support for ras page retirement Chen, Guchun
2019-09-05  4:03 [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure Zhou1, Tao
     [not found] ` <20190905040324.18741-1-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-09-05  4:04   ` [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place Zhou1, Tao
     [not found]     ` <20190905040324.18741-4-tao.zhou1-5C7GfCeVMHo@public.gmane.org>
2019-09-05  7:56       ` Chen, Guchun

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.