All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdgpu: check if vram is lost v2
@ 2017-05-16  9:25 Chunming Zhou
       [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Chunming Zhou @ 2017-05-16  9:25 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Chunming Zhou

backup the first 64 bytes of the gart table as a reset magic; check whether the
magic is still the same after a gpu hw reset.
v2: use memcmp instead of a manual byte-by-byte comparison.

Change-Id: I9a73720da4084ea8677c3031dfb62e8157ee5704
Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index de08ff0..f9da215 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1502,6 +1502,7 @@ struct amdgpu_ssg {
 #endif
 };
 
+#define AMDGPU_RESET_MAGIC_NUM 64
 struct amdgpu_device {
 	struct device			*dev;
 	struct drm_device		*ddev;
@@ -1705,6 +1706,7 @@ struct amdgpu_device {
 
 	/* record hw reset is performed */
 	bool has_hw_reset;
+	u8				reset_magic[AMDGPU_RESET_MAGIC_NUM];
 
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0a31fb1..c56ae4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1685,6 +1685,17 @@ static int amdgpu_init(struct amdgpu_device *adev)
 	return 0;
 }
 
+static void amdgpu_fill_reset_magic(struct amdgpu_device *adev)
+{
+	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
+}
+
+static bool amdgpu_check_vram_lost(struct amdgpu_device *adev)
+{
+	return !!memcmp(adev->gart.ptr, adev->reset_magic,
+			AMDGPU_RESET_MAGIC_NUM);
+}
+
 static int amdgpu_late_init(struct amdgpu_device *adev)
 {
 	int i = 0, r;
@@ -1715,6 +1726,8 @@ static int amdgpu_late_init(struct amdgpu_device *adev)
 		}
 	}
 
+	amdgpu_fill_reset_magic(adev);
+
 	return 0;
 }
 
@@ -2830,7 +2843,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 	struct drm_atomic_state *state = NULL;
 	int i, r;
 	int resched;
-	bool need_full_reset;
+	bool need_full_reset, vram_lost = false;
 
 	if (amdgpu_sriov_vf(adev))
 		return amdgpu_sriov_gpu_reset(adev, true);
@@ -2899,12 +2912,17 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 			r = amdgpu_resume_phase1(adev);
 			if (r)
 				goto out;
+			vram_lost = amdgpu_check_vram_lost(adev);
+			if (vram_lost)
+				DRM_ERROR("VRAM is lost!\n");
 			r = amdgpu_ttm_recover_gart(adev);
 			if (r)
 				goto out;
 			r = amdgpu_resume_phase2(adev);
 			if (r)
 				goto out;
+			if (vram_lost)
+				amdgpu_fill_reset_magic(adev);
 		}
 	}
 out:
-- 
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when vram is lost v2
       [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-16  9:25   ` Chunming Zhou
       [not found]     ` <1494926750-1081-2-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
  2017-05-16  9:25   ` [PATCH 3/4] drm/amdgpu: skip all jobs of guilty vm Chunming Zhou
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 22+ messages in thread
From: Chunming Zhou @ 2017-05-16  9:25 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Chunming Zhou

The ioctls below will return -ENODEV:
amdgpu_cs_ioctl
amdgpu_cs_wait_ioctl
amdgpu_cs_wait_fences_ioctl
amdgpu_gem_va_ioctl
amdgpu_info_ioctl

v2: only for map and replace cases in amdgpu_gem_va_ioctl

Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e
Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 10 ++++++++++
 5 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f9da215..dcd6203 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -855,6 +855,7 @@ struct amdgpu_fpriv {
 	struct amdgpu_ctx_mgr	ctx_mgr;
 	spinlock_t		sem_handles_lock;
 	struct idr		sem_handles;
+	u32			vram_lost_counter;
 };
 
 /*
@@ -1607,6 +1608,7 @@ struct amdgpu_device {
 	atomic64_t			num_bytes_moved;
 	atomic64_t			num_evictions;
 	atomic_t			gpu_reset_counter;
+	atomic_t			vram_lost_counter;
 
 	/* data for buffer migration throttling */
 	struct {
@@ -2005,6 +2007,8 @@ static inline void amdgpu_unregister_atpx_handler(void) {}
 extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
 extern const int amdgpu_max_kms_ioctl;
 
+bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
+			  struct amdgpu_fpriv *fpriv);
 int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags);
 int amdgpu_driver_unload_kms(struct drm_device *dev);
 void amdgpu_driver_lastclose_kms(struct drm_device *dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index b803412..911aa02 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 {
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_fpriv *fpriv = filp->driver_priv;
 	union drm_amdgpu_cs *cs = data;
 	struct amdgpu_cs_parser parser = {};
 	bool reserved_buffers = false;
@@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 
 	if (!adev->accel_working)
 		return -EBUSY;
+	if (amdgpu_kms_vram_lost(adev, fpriv))
+		return -ENODEV;
 
 	parser.adev = adev;
 	parser.filp = filp;
@@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
 {
 	union drm_amdgpu_wait_cs *wait = data;
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_fpriv *fpriv = filp->driver_priv;
 	unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
 	struct amdgpu_ring *ring = NULL;
 	struct amdgpu_ctx *ctx;
 	struct fence *fence;
 	long r;
 
+	if (amdgpu_kms_vram_lost(adev, fpriv))
+		return -ENODEV;
 	r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait->in.ip_instance,
 			       wait->in.ring, &ring);
 	if (r)
@@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *filp)
 {
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_fpriv *fpriv = filp->driver_priv;
 	union drm_amdgpu_wait_fences *wait = data;
 	uint32_t fence_count = wait->in.fence_count;
 	struct drm_amdgpu_fence *fences_user;
 	struct drm_amdgpu_fence *fences;
 	int r;
 
+	if (amdgpu_kms_vram_lost(adev, fpriv))
+		return -ENODEV;
 	/* Get the fences from userspace */
 	fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence),
 			GFP_KERNEL);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c56ae4a..2f0fcf8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2913,8 +2913,10 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 			if (r)
 				goto out;
 			vram_lost = amdgpu_check_vram_lost(adev);
-			if (vram_lost)
+			if (vram_lost) {
 				DRM_ERROR("VRAM is lost!\n");
+				atomic_inc(&adev->vram_lost_counter);
+			}
 			r = amdgpu_ttm_recover_gart(adev);
 			if (r)
 				goto out;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index d8275ef..83bc94c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -802,6 +802,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
 			args->operation);
 		return -EINVAL;
 	}
+	if ((args->operation == AMDGPU_VA_OP_MAP) ||
+	    (args->operation == AMDGPU_VA_OP_REPLACE)) {
+		if (amdgpu_kms_vram_lost(adev, fpriv))
+			return -ENODEV;
+	}
 
 	INIT_LIST_HEAD(&list);
 	if ((args->operation != AMDGPU_VA_OP_CLEAR) &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 368829a..a231aa1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info,
 static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 {
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_fpriv *fpriv = filp->driver_priv;
 	struct drm_amdgpu_info *info = data;
 	struct amdgpu_mode_info *minfo = &adev->mode_info;
 	void __user *out = (void __user *)(uintptr_t)info->return_pointer;
@@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
 
 	if (!info->return_size || !info->return_pointer)
 		return -EINVAL;
+	if (amdgpu_kms_vram_lost(adev, fpriv))
+		return -ENODEV;
 
 	switch (info->query) {
 	case AMDGPU_INFO_VIRTUAL_RANGE: {
@@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct drm_device *dev)
 	vga_switcheroo_process_delayed_switch();
 }
 
+bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
+			  struct amdgpu_fpriv *fpriv)
+{
+	return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter);
+}
+
 /**
  * amdgpu_driver_open_kms - drm callback for open
  *
@@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
 
 	amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
 
+	fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
 	file_priv->driver_priv = fpriv;
 
 out_suspend:
-- 
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 3/4] drm/amdgpu: skip all jobs of guilty vm
       [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
  2017-05-16  9:25   ` [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when " Chunming Zhou
@ 2017-05-16  9:25   ` Chunming Zhou
  2017-05-16  9:25   ` [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter Chunming Zhou
  2017-05-16 10:49   ` [PATCH 1/4] drm/amdgpu: check if vram is lost v2 Christian König
  3 siblings, 0 replies; 22+ messages in thread
From: Chunming Zhou @ 2017-05-16  9:25 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Chunming Zhou

Change-Id: Ib305282a21d37d6872afe4c1ce63d53b6517f338
Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 4a5ffa5..da2f87e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -167,6 +167,7 @@ static struct fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 {
 	struct fence *fence = NULL;
 	struct amdgpu_job *job;
+	struct amdgpu_fpriv *fpriv = NULL;
 	int r;
 
 	if (!sched_job) {
@@ -178,10 +179,16 @@ static struct fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
 
 	trace_amdgpu_sched_run_job(job);
-	r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence);
-	if (r)
-		DRM_ERROR("Error scheduling IBs (%d)\n", r);
-
+	if (job->vm)
+		fpriv = container_of(job->vm, struct amdgpu_fpriv, vm);
+	/* skip ib schedule when vram is lost */
+	if (fpriv && amdgpu_kms_vram_lost(job->adev, fpriv))
+		DRM_ERROR("Skip scheduling IBs!\n");
+	else {
+		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence);
+		if (r)
+			DRM_ERROR("Error scheduling IBs (%d)\n", r);
+	}
 	/* if gpu reset, hw fence will be replaced here */
 	fence_put(job->fence);
 	job->fence = fence_get(fence);
-- 
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
  2017-05-16  9:25   ` [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when " Chunming Zhou
  2017-05-16  9:25   ` [PATCH 3/4] drm/amdgpu: skip all jobs of guilty vm Chunming Zhou
@ 2017-05-16  9:25   ` Chunming Zhou
       [not found]     ` <1494926750-1081-4-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
  2017-05-16 10:49   ` [PATCH 1/4] drm/amdgpu: check if vram is lost v2 Christian König
  3 siblings, 1 reply; 22+ messages in thread
From: Chunming Zhou @ 2017-05-16  9:25 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Chunming Zhou

Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++
 include/uapi/drm/amdgpu_drm.h          | 1 +
 2 files changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index bca1fb5..f3e7525 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 	case AMDGPU_VM_OP_UNRESERVE_VMID:
 		amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB);
 		break;
+	case AMDGPU_VM_OP_RESET:
+		fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
+		break;
 	default:
 		return -EINVAL;
 	}
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 81141e1..ed2ef9b 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -246,6 +246,7 @@ struct drm_amdgpu_sem_in {
 /* vm ioctl */
 #define AMDGPU_VM_OP_RESERVE_VMID	1
 #define AMDGPU_VM_OP_UNRESERVE_VMID	2
+#define AMDGPU_VM_OP_RESET		3
 
 struct drm_amdgpu_vm_in {
 	/** AMDGPU_VM_OP_* */
-- 
1.9.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/4] drm/amdgpu: check if vram is lost v2
       [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2017-05-16  9:25   ` [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter Chunming Zhou
@ 2017-05-16 10:49   ` Christian König
       [not found]     ` <0c1d89c5-65ea-cdad-100a-80d0377b865c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  3 siblings, 1 reply; 22+ messages in thread
From: Christian König @ 2017-05-16 10:49 UTC (permalink / raw)
  To: Chunming Zhou, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 16.05.2017 um 11:25 schrieb Chunming Zhou:
> bakup first 64 byte of gart table as reset magic, check if magic is same
> after gpu hw reset.
> v2: use memcmp instead of manual innovation.
>
> Change-Id: I9a73720da4084ea8677c3031dfb62e8157ee5704
> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>

Patch #1-#3 are Reviewed-by: Christian König <christian.koenig@amd.com>

Patch #4:

You need to add the new enum on line 591 or otherwise you will get an 
"unsupported operation" error.

Line 604 should be changed as well or otherwise we need a BO for this 
operation.

A libdrm test case to just call this IOCTL would probably be a good idea.

Additional to that I would ping Marek (Mesa) and Michel (DDX) for their 
opinion on this. Could be that this is completely superfluous and the 
UMDs needs something else.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++-
>   2 files changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index de08ff0..f9da215 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1502,6 +1502,7 @@ struct amdgpu_ssg {
>   #endif
>   };
>   
> +#define AMDGPU_RESET_MAGIC_NUM 64
>   struct amdgpu_device {
>   	struct device			*dev;
>   	struct drm_device		*ddev;
> @@ -1705,6 +1706,7 @@ struct amdgpu_device {
>   
>   	/* record hw reset is performed */
>   	bool has_hw_reset;
> +	u8				reset_magic[AMDGPU_RESET_MAGIC_NUM];
>   
>   };
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0a31fb1..c56ae4a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1685,6 +1685,17 @@ static int amdgpu_init(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> +static void amdgpu_fill_reset_magic(struct amdgpu_device *adev)
> +{
> +	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
> +}
> +
> +static bool amdgpu_check_vram_lost(struct amdgpu_device *adev)
> +{
> +	return !!memcmp(adev->gart.ptr, adev->reset_magic,
> +			AMDGPU_RESET_MAGIC_NUM);
> +}
> +
>   static int amdgpu_late_init(struct amdgpu_device *adev)
>   {
>   	int i = 0, r;
> @@ -1715,6 +1726,8 @@ static int amdgpu_late_init(struct amdgpu_device *adev)
>   		}
>   	}
>   
> +	amdgpu_fill_reset_magic(adev);
> +
>   	return 0;
>   }
>   
> @@ -2830,7 +2843,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	struct drm_atomic_state *state = NULL;
>   	int i, r;
>   	int resched;
> -	bool need_full_reset;
> +	bool need_full_reset, vram_lost = false;
>   
>   	if (amdgpu_sriov_vf(adev))
>   		return amdgpu_sriov_gpu_reset(adev, true);
> @@ -2899,12 +2912,17 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   			r = amdgpu_resume_phase1(adev);
>   			if (r)
>   				goto out;
> +			vram_lost = amdgpu_check_vram_lost(adev);
> +			if (vram_lost)
> +				DRM_ERROR("VRAM is lost!\n");
>   			r = amdgpu_ttm_recover_gart(adev);
>   			if (r)
>   				goto out;
>   			r = amdgpu_resume_phase2(adev);
>   			if (r)
>   				goto out;
> +			if (vram_lost)
> +				amdgpu_fill_reset_magic(adev);
>   		}
>   	}
>   out:


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]     ` <1494926750-1081-4-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-17  1:18       ` Michel Dänzer
       [not found]         ` <58988726-543a-535a-3011-860d29b9f2da-otUistvHUpPR7s880joybQ@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Michel Dänzer @ 2017-05-17  1:18 UTC (permalink / raw)
  To: Chunming Zhou; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 16/05/17 06:25 PM, Chunming Zhou wrote:
> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++
>  include/uapi/drm/amdgpu_drm.h          | 1 +
>  2 files changed, 4 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index bca1fb5..f3e7525 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>  	case AMDGPU_VM_OP_UNRESERVE_VMID:
>  		amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB);
>  		break;
> +	case AMDGPU_VM_OP_RESET:
> +		fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> +		break;

How do you envision the UMDs using this? I can mostly think of them
calling this ioctl when a context is created or destroyed. But that
would also allow any other remaining contexts using the same DRM file
descriptor to use all ioctls again. So, I think there needs to be a
vram_lost_counter in struct amdgpu_ctx instead of in struct amdgpu_fpriv.

But then I'm not sure this ioctl will be useful... I guess the UMD could
try re-uploading the contents of crucial BOs (shader code, resource
descriptors etc.) for an existing context and then call this ioctl. How
about the GPUVM page tables? Will the kernel driver automatically
re-generate those as needed, or will the UMD also need to e.g. destroy
and re-create the VM mappings for all BOs before calling this ioctl?

It's hard to be sure whether that's workable for the UMD without at
least a working prototype...


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]         ` <58988726-543a-535a-3011-860d29b9f2da-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2017-05-17  3:04           ` zhoucm1
       [not found]             ` <591BBDA2.1070900-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: zhoucm1 @ 2017-05-17  3:04 UTC (permalink / raw)
  To: Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2017年05月17日 09:18, Michel Dänzer wrote:
> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++
>>   include/uapi/drm/amdgpu_drm.h          | 1 +
>>   2 files changed, 4 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index bca1fb5..f3e7525 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>>   	case AMDGPU_VM_OP_UNRESERVE_VMID:
>>   		amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB);
>>   		break;
>> +	case AMDGPU_VM_OP_RESET:
>> +		fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>> +		break;
> How do you envision the UMDs using this? I can mostly think of them
> calling this ioctl when a context is created or destroyed. But that
> would also allow any other remaining contexts using the same DRM file
> descriptor to use all ioctls again. So, I think there needs to be a
> vram_lost_counter in struct amdgpu_ctx instead of in struct amdgpu_fpriv.
struct amdgpu_fpriv is the proper place for vram_lost_counter, especially
for the ioctl return value.
If you need to reset contexts one by one, we can mark all contexts of that
VM, and then reset them from userspace.
>
> But then I'm not sure this ioctl will be useful... I guess the UMD could
> try re-uploading the contents of crucial BOs (shader code, resource
> descriptors etc.) for an existing context and then call this ioctl. How
> about the GPUVM page tables? Will the kernel driver automatically
> re-generate those as needed, or will the UMD also need to e.g. destroy
> and re-create the VM mappings for all BOs before calling this ioctl?
The VM page tables will be recovered from the VM shadow BOs.

>
> It's hard to be sure whether that's workable for the UMD without at
> least a working prototype...
Totally agree. If you can help to do this in userspace, I'd like to
support you from the kernel side, or Christian can.

Thanks,
David Zhou
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]             ` <591BBDA2.1070900-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-17  3:15               ` Michel Dänzer
       [not found]                 ` <29fe2142-7fd1-e23a-49d9-c38dc685db92-otUistvHUpPR7s880joybQ@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Michel Dänzer @ 2017-05-17  3:15 UTC (permalink / raw)
  To: zhoucm1; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 17/05/17 12:04 PM, zhoucm1 wrote:
> On 2017年05月17日 09:18, Michel Dänzer wrote:
>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++
>>>   include/uapi/drm/amdgpu_drm.h          | 1 +
>>>   2 files changed, 4 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index bca1fb5..f3e7525 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>>> void *data, struct drm_file *filp)
>>>       case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>           amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB);
>>>           break;
>>> +    case AMDGPU_VM_OP_RESET:
>>> +        fpriv->vram_lost_counter =
>>> atomic_read(&adev->vram_lost_counter);
>>> +        break;
>> How do you envision the UMDs using this? I can mostly think of them
>> calling this ioctl when a context is created or destroyed. But that
>> would also allow any other remaining contexts using the same DRM file
>> descriptor to use all ioctls again. So, I think there needs to be a
>> vram_lost_counter in struct amdgpu_ctx instead of in struct amdgpu_fpriv.
> struct amdgpu_fpriv for vram_lost_counter is proper place, especially
> for ioctl return value.
> if you need to reset ctx one by one, we can mark all contexts of that
> vm, and then reset by userspace.

I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
context calls this ioctl, all other contexts using the same file
descriptor will also be considered safe again, right? So I'm still not
sure how this is supposed to be used by the UMDs. Can you describe your
idea for that?


>> It's hard to be sure whether that's workable for the UMD without at
>> least a working prototype...
> Totally agree, if you can help to do this in userspace, I'd like to
> support you from kernel side, or Christian.

I'm busy with other stuff.


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                 ` <29fe2142-7fd1-e23a-49d9-c38dc685db92-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2017-05-17  4:28                   ` zhoucm1
       [not found]                     ` <591BD17C.8050903-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: zhoucm1 @ 2017-05-17  4:28 UTC (permalink / raw)
  To: Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2017年05月17日 11:15, Michel Dänzer wrote:
> On 17/05/17 12:04 PM, zhoucm1 wrote:
>> On 2017年05月17日 09:18, Michel Dänzer wrote:
>>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++
>>>>    include/uapi/drm/amdgpu_drm.h          | 1 +
>>>>    2 files changed, 4 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index bca1fb5..f3e7525 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>>>> void *data, struct drm_file *filp)
>>>>        case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>>            amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB);
>>>>            break;
>>>> +    case AMDGPU_VM_OP_RESET:
>>>> +        fpriv->vram_lost_counter =
>>>> atomic_read(&adev->vram_lost_counter);
>>>> +        break;
>>> How do you envision the UMDs using this? I can mostly think of them
>>> calling this ioctl when a context is created or destroyed. But that
>>> would also allow any other remaining contexts using the same DRM file
>>> descriptor to use all ioctls again. So, I think there needs to be a
>>> vram_lost_counter in struct amdgpu_ctx instead of in struct amdgpu_fpriv.
>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially
>> for ioctl return value.
>> if you need to reset ctx one by one, we can mark all contexts of that
>> vm, and then reset by userspace.
> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
> context calls this ioctl, all other contexts using the same file
> descriptor will also be considered safe again, right?
Yes, but it really depends on the userspace requirements. If you need to
reset contexts one by one, we can mark all contexts of that VM as guilty, and
then reset one context at a time from userspace.
> So I'm still not
> sure how this is supposed to be used by the UMDs. Can you describe your
> idea for that?
A correction first: this idea was picked up from Christian. We just want to
provide a possibility to handle ENODEV and recover the system, rather than
the system simply being dead when vram is lost.
As for how the UMDs handle reset — that obviously needs more discussion
between kernel and userspace.

Regards,
David Zhou
>
>
>>> It's hard to be sure whether that's workable for the UMD without at
>>> least a working prototype...
>> Totally agree, if you can help to do this in userspace, I'd like to
>> support you from kernel side, or Christian.
> I'm busy with other stuff.
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/4] drm/amdgpu: check if vram is lost v2
       [not found]     ` <0c1d89c5-65ea-cdad-100a-80d0377b865c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-17  4:37       ` zhoucm1
  0 siblings, 0 replies; 22+ messages in thread
From: zhoucm1 @ 2017-05-17  4:37 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2017年05月16日 18:49, Christian König wrote:
> Am 16.05.2017 um 11:25 schrieb Chunming Zhou:
>> bakup first 64 byte of gart table as reset magic, check if magic is same
>> after gpu hw reset.
>> v2: use memcmp instead of manual innovation.
>>
>> Change-Id: I9a73720da4084ea8677c3031dfb62e8157ee5704
>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>
> Patch #1-#3 are Reviewed-by: Christian König <christian.koenig@amd.com>
pushed.
>
> Patch #4:
>
> You need to add the new enum on line 591 or otherwise you will get an 
> "unsupported operation" error.
>
> Line 604 should be changed as well or otherwise we need a BO for this 
> operation.
Are you sure you are talking about this patch #4? I cannot locate what you are referring to.

>
> A libdrm test case to just call this IOCTL would probably be a good idea.
>
> Additional to that I would ping Marek (Mesa) and Michel (DDX) for 
> their opinion on this. Could be that this is completely superfluous 
> and the UMDs needs something else.
Michel seems to have a different opinion/concern; maybe we need more 
discussion before we make new interfaces.

Thanks,
David Zhou
>
> Regards,
> Christian.
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++-
>>   2 files changed, 21 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index de08ff0..f9da215 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1502,6 +1502,7 @@ struct amdgpu_ssg {
>>   #endif
>>   };
>>   +#define AMDGPU_RESET_MAGIC_NUM 64
>>   struct amdgpu_device {
>>       struct device            *dev;
>>       struct drm_device        *ddev;
>> @@ -1705,6 +1706,7 @@ struct amdgpu_device {
>>         /* record hw reset is performed */
>>       bool has_hw_reset;
>> +    u8                reset_magic[AMDGPU_RESET_MAGIC_NUM];
>>     };
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 0a31fb1..c56ae4a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1685,6 +1685,17 @@ static int amdgpu_init(struct amdgpu_device 
>> *adev)
>>       return 0;
>>   }
>>   +static void amdgpu_fill_reset_magic(struct amdgpu_device *adev)
>> +{
>> +    memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
>> +}
>> +
>> +static bool amdgpu_check_vram_lost(struct amdgpu_device *adev)
>> +{
>> +    return !!memcmp(adev->gart.ptr, adev->reset_magic,
>> +            AMDGPU_RESET_MAGIC_NUM);
>> +}
>> +
>>   static int amdgpu_late_init(struct amdgpu_device *adev)
>>   {
>>       int i = 0, r;
>> @@ -1715,6 +1726,8 @@ static int amdgpu_late_init(struct 
>> amdgpu_device *adev)
>>           }
>>       }
>>   +    amdgpu_fill_reset_magic(adev);
>> +
>>       return 0;
>>   }
>>   @@ -2830,7 +2843,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>       struct drm_atomic_state *state = NULL;
>>       int i, r;
>>       int resched;
>> -    bool need_full_reset;
>> +    bool need_full_reset, vram_lost = false;
>>         if (amdgpu_sriov_vf(adev))
>>           return amdgpu_sriov_gpu_reset(adev, true);
>> @@ -2899,12 +2912,17 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>               r = amdgpu_resume_phase1(adev);
>>               if (r)
>>                   goto out;
>> +            vram_lost = amdgpu_check_vram_lost(adev);
>> +            if (vram_lost)
>> +                DRM_ERROR("VRAM is lost!\n");
>>               r = amdgpu_ttm_recover_gart(adev);
>>               if (r)
>>                   goto out;
>>               r = amdgpu_resume_phase2(adev);
>>               if (r)
>>                   goto out;
>> +            if (vram_lost)
>> +                amdgpu_fill_reset_magic(adev);
>>           }
>>       }
>>   out:
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                     ` <591BD17C.8050903-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-17  6:57                       ` Michel Dänzer
       [not found]                         ` <7d87bc8e-9c09-ad25-de6e-dfbd8116bf6e-otUistvHUpPR7s880joybQ@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Michel Dänzer @ 2017-05-17  6:57 UTC (permalink / raw)
  To: zhoucm1; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 17/05/17 01:28 PM, zhoucm1 wrote:
> On 2017年05月17日 11:15, Michel Dänzer wrote:
>> On 17/05/17 12:04 PM, zhoucm1 wrote:
>>> On 2017年05月17日 09:18, Michel Dänzer wrote:
>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> index bca1fb5..f3e7525 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>>>>> void *data, struct drm_file *filp)
>>>>>        case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>>>            amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm,
>>>>> AMDGPU_GFXHUB);
>>>>>            break;
>>>>> +    case AMDGPU_VM_OP_RESET:
>>>>> +        fpriv->vram_lost_counter =
>>>>> atomic_read(&adev->vram_lost_counter);
>>>>> +        break;
>>>> How do you envision the UMDs using this? I can mostly think of them
>>>> calling this ioctl when a context is created or destroyed. But that
>>>> would also allow any other remaining contexts using the same DRM file
>>>> descriptor to use all ioctls again. So, I think there needs to be a
>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct
>>>> amdgpu_fpriv.
>>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially
>>> for ioctl return value.
>>> if you need to reset ctx one by one, we can mark all contexts of that
>>> vm, and then reset by userspace.
>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
>> context calls this ioctl, all other contexts using the same file
>> descriptor will also be considered safe again, right?
> Yes, but it really depends on userspace requirement, if you need to
> reset ctx one by one, we can mark all contexts of that vm to guilty, and
> then reset one context by userspace.

Still not sure what you mean by that.

E.g. what do you mean by "guilty"? I thought that refers to the context
which caused a hang. But it seems like you're using it to refer to any
context which hasn't reacted yet to VRAM contents being lost.

Also not sure what you mean by "if you need to reset ctx one by one".


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                         ` <7d87bc8e-9c09-ad25-de6e-dfbd8116bf6e-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2017-05-17  7:13                           ` zhoucm1
       [not found]                             ` <591BF825.6090505-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: zhoucm1 @ 2017-05-17  7:13 UTC (permalink / raw)
  To: Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2017年05月17日 14:57, Michel Dänzer wrote:
> On 17/05/17 01:28 PM, zhoucm1 wrote:
>> On 2017年05月17日 11:15, Michel Dänzer wrote:
>>> On 17/05/17 12:04 PM, zhoucm1 wrote:
>>>> On 2017年05月17日 09:18, Michel Dänzer wrote:
>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> index bca1fb5..f3e7525 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>>>>>> void *data, struct drm_file *filp)
>>>>>>         case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>>>>             amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm,
>>>>>> AMDGPU_GFXHUB);
>>>>>>             break;
>>>>>> +    case AMDGPU_VM_OP_RESET:
>>>>>> +        fpriv->vram_lost_counter =
>>>>>> atomic_read(&adev->vram_lost_counter);
>>>>>> +        break;
>>>>> How do you envision the UMDs using this? I can mostly think of them
>>>>> calling this ioctl when a context is created or destroyed. But that
>>>>> would also allow any other remaining contexts using the same DRM file
>>>>> descriptor to use all ioctls again. So, I think there needs to be a
>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct
>>>>> amdgpu_fpriv.
>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially
>>>> for ioctl return value.
>>>> if you need to reset ctx one by one, we can mark all contexts of that
>>>> vm, and then reset by userspace.
>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
>>> context calls this ioctl, all other contexts using the same file
>>> descriptor will also be considered safe again, right?
>> Yes, but it really depends on userspace requirement, if you need to
>> reset ctx one by one, we can mark all contexts of that vm to guilty, and
>> then reset one context by userspace.
> Still not sure what you mean by that.
>
> E.g. what do you mean by "guilty"? I thought that refers to the context
> which caused a hang. But it seems like you're using it to refer to any
> context which hasn't reacted yet to VRAM contents being lost.
When vram is lost, we treat all contexts as needing a reset.

Regards,
David Zhou
>
> Also not sure what you mean by "if you need to reset ctx one by one".
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                             ` <591BF825.6090505-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-17  8:01                               ` Michel Dänzer
       [not found]                                 ` <31db7a30-dd98-5cb2-4125-187d3d0e2a49-otUistvHUpPR7s880joybQ@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Michel Dänzer @ 2017-05-17  8:01 UTC (permalink / raw)
  To: zhoucm1; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 17/05/17 04:13 PM, zhoucm1 wrote:
> On 2017年05月17日 14:57, Michel Dänzer wrote:
>> On 17/05/17 01:28 PM, zhoucm1 wrote:
>>> On 2017年05月17日 11:15, Michel Dänzer wrote:
>>>> On 17/05/17 12:04 PM, zhoucm1 wrote:
>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote:
>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> index bca1fb5..f3e7525 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>>>>>>> void *data, struct drm_file *filp)
>>>>>>>         case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>>>>>             amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm,
>>>>>>> AMDGPU_GFXHUB);
>>>>>>>             break;
>>>>>>> +    case AMDGPU_VM_OP_RESET:
>>>>>>> +        fpriv->vram_lost_counter =
>>>>>>> atomic_read(&adev->vram_lost_counter);
>>>>>>> +        break;
>>>>>> How do you envision the UMDs using this? I can mostly think of them
>>>>>> calling this ioctl when a context is created or destroyed. But that
>>>>>> would also allow any other remaining contexts using the same DRM file
>>>>>> descriptor to use all ioctls again. So, I think there needs to be a
>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct
>>>>>> amdgpu_fpriv.
>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially
>>>>> for ioctl return value.
>>>>> if you need to reset ctx one by one, we can mark all contexts of that
>>>>> vm, and then reset by userspace.
>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
>>>> context calls this ioctl, all other contexts using the same file
>>>> descriptor will also be considered safe again, right?
>>> Yes, but it really depends on userspace requirement, if you need to
>>> reset ctx one by one, we can mark all contexts of that vm to guilty, and
>>> then reset one context by userspace.
>> Still not sure what you mean by that.
>>
>> E.g. what do you mean by "guilty"? I thought that refers to the context
>> which caused a hang. But it seems like you're using it to refer to any
>> context which hasn't reacted yet to VRAM contents being lost.
> When vram is lost, we treat all contexts need to reset.

Essentially, your patches only track VRAM contents being lost per file
descriptor, not per context. I'm not sure (rather skeptical) that this
is suitable for OpenGL UMDs, since state is usually tracked per context.
Marek / Nicolai?


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                                 ` <31db7a30-dd98-5cb2-4125-187d3d0e2a49-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2017-05-17  8:40                                   ` Christian König
       [not found]                                     ` <7a302ebe-1de1-734f-fb21-aadcc7904d37-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Christian König @ 2017-05-17  8:40 UTC (permalink / raw)
  To: Michel Dänzer, zhoucm1; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 17.05.2017 um 10:01 schrieb Michel Dänzer:
> On 17/05/17 04:13 PM, zhoucm1 wrote:
>> On 2017年05月17日 14:57, Michel Dänzer wrote:
>>> On 17/05/17 01:28 PM, zhoucm1 wrote:
>>>> On 2017年05月17日 11:15, Michel Dänzer wrote:
>>>>> On 17/05/17 12:04 PM, zhoucm1 wrote:
>>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote:
>>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>> index bca1fb5..f3e7525 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>>>>>>>> void *data, struct drm_file *filp)
>>>>>>>>          case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>>>>>>              amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm,
>>>>>>>> AMDGPU_GFXHUB);
>>>>>>>>              break;
>>>>>>>> +    case AMDGPU_VM_OP_RESET:
>>>>>>>> +        fpriv->vram_lost_counter =
>>>>>>>> atomic_read(&adev->vram_lost_counter);
>>>>>>>> +        break;
>>>>>>> How do you envision the UMDs using this? I can mostly think of them
>>>>>>> calling this ioctl when a context is created or destroyed. But that
>>>>>>> would also allow any other remaining contexts using the same DRM file
>>>>>>> descriptor to use all ioctls again. So, I think there needs to be a
>>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct
>>>>>>> amdgpu_fpriv.
>>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially
>>>>>> for ioctl return value.
>>>>>> if you need to reset ctx one by one, we can mark all contexts of that
>>>>>> vm, and then reset by userspace.
>>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
>>>>> context calls this ioctl, all other contexts using the same file
>>>>> descriptor will also be considered safe again, right?
>>>> Yes, but it really depends on userspace requirement, if you need to
>>>> reset ctx one by one, we can mark all contexts of that vm to guilty, and
>>>> then reset one context by userspace.
>>> Still not sure what you mean by that.
>>>
>>> E.g. what do you mean by "guilty"? I thought that refers to the context
>>> which caused a hang. But it seems like you're using it to refer to any
>>> context which hasn't reacted yet to VRAM contents being lost.
>> When vram is lost, we treat all contexts need to reset.
> Essentially, your patches only track VRAM contents being lost per file
> descriptor, not per context. I'm not sure (rather skeptical) that this
> is suitable for OpenGL UMDs, since state is usually tracked per context.
> Marek / Nicolai?

Oh, yeah that's a good point.

The problem with tracking it per context is that Vulkan also wants the 
ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are 
context less.

But thinking more about this, blocking those two doesn't make much sense. 
The VM content can be restored, and why should we disallow reading GPU info?

Christian.

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                                     ` <7a302ebe-1de1-734f-fb21-aadcc7904d37-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-17  8:46                                       ` zhoucm1
       [not found]                                         ` <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: zhoucm1 @ 2017-05-17  8:46 UTC (permalink / raw)
  To: Christian König, Michel Dänzer
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 4220 bytes --]



On 2017年05月17日 16:40, Christian König wrote:
> Am 17.05.2017 um 10:01 schrieb Michel Dänzer:
>> On 17/05/17 04:13 PM, zhoucm1 wrote:
>>> On 2017年05月17日 14:57, Michel Dänzer wrote:
>>>> On 17/05/17 01:28 PM, zhoucm1 wrote:
>>>>> On 2017年05月17日 11:15, Michel Dänzer wrote:
>>>>>> On 17/05/17 12:04 PM, zhoucm1 wrote:
>>>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote:
>>>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou-5C7GfCeVMHo@public.gmane.org>
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>> index bca1fb5..f3e7525 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>>>>>>>>> void *data, struct drm_file *filp)
>>>>>>>>>          case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>>>>>>>              amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm,
>>>>>>>>> AMDGPU_GFXHUB);
>>>>>>>>>              break;
>>>>>>>>> +    case AMDGPU_VM_OP_RESET:
>>>>>>>>> +        fpriv->vram_lost_counter =
>>>>>>>>> atomic_read(&adev->vram_lost_counter);
>>>>>>>>> +        break;
>>>>>>>> How do you envision the UMDs using this? I can mostly think of 
>>>>>>>> them
>>>>>>>> calling this ioctl when a context is created or destroyed. But 
>>>>>>>> that
>>>>>>>> would also allow any other remaining contexts using the same 
>>>>>>>> DRM file
>>>>>>>> descriptor to use all ioctls again. So, I think there needs to 
>>>>>>>> be a
>>>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct
>>>>>>>> amdgpu_fpriv.
>>>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, 
>>>>>>> especially
>>>>>>> for ioctl return value.
>>>>>>> if you need to reset ctx one by one, we can mark all contexts of 
>>>>>>> that
>>>>>>> vm, and then reset by userspace.
>>>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
>>>>>> context calls this ioctl, all other contexts using the same file
>>>>>> descriptor will also be considered safe again, right?
>>>>> Yes, but it really depends on userspace requirement, if you need to
>>>>> reset ctx one by one, we can mark all contexts of that vm to 
>>>>> guilty, and
>>>>> then reset one context by userspace.
>>>> Still not sure what you mean by that.
>>>>
>>>> E.g. what do you mean by "guilty"? I thought that refers to the 
>>>> context
>>>> which caused a hang. But it seems like you're using it to refer to any
>>>> context which hasn't reacted yet to VRAM contents being lost.
>>> When vram is lost, we treat all contexts need to reset.
>> Essentially, your patches only track VRAM contents being lost per file
>> descriptor, not per context. I'm not sure (rather skeptical) that this
>> is suitable for OpenGL UMDs, since state is usually tracked per context.
>> Marek / Nicolai?
>
> Oh, yeah that's a good point.
>
> The problem with tracking it per context is that Vulkan also wants the 
> ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are 
> context less.
>
> But thinking more about this blocking those two doesn't make much 
> sense. The VM content can be restored and why should be disallow 
> reading GPU info?
I can re-paste the Vulkan APIs requiring ENODEV:
"

The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST according 
to the spec.

I tried to provide a list of u/k interfaces that could be called for 
each vk API.

vkCreateDevice

-amdgpu_device_initialize.

-amdgpu_query_gpu_info

vkQueueSubmit

-amdgpu_cs_submit

vkWaitForFences

                 amdgpu_cs_wait_fences

vkGetEventStatus

vkQueueWaitIdle

vkDeviceWaitIdle

vkGetQueryPoolResults**

                 amdgpu_cs_query_Fence_status

vkQueueBindSparse**

                 amdgpu_bo_va_op

                 amdgpu_bo_va_op_raw

vkCreateSwapchainKHR**

vkAcquireNextImageKHR**

vkQueuePresentKHR

                 Not related with u/k interface.**

**

Besides those listed above, I think 
amdgpu_cs_signal_Sem/amdgpu_cs_wait_sem should respond to gpu reset as 
well."
>
> Christian.
>


[-- Attachment #1.2: Type: text/html, Size: 9346 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                                         ` <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-17  8:55                                           ` Michel Dänzer
  2017-05-17  8:56                                           ` Christian König
  1 sibling, 0 replies; 22+ messages in thread
From: Michel Dänzer @ 2017-05-17  8:55 UTC (permalink / raw)
  To: zhoucm1, Christian König; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 17/05/17 05:46 PM, zhoucm1 wrote:
> 
> 
> On 2017年05月17日 16:40, Christian König wrote:
>> Am 17.05.2017 um 10:01 schrieb Michel Dänzer:
>>> On 17/05/17 04:13 PM, zhoucm1 wrote:
>>>> On 2017年05月17日 14:57, Michel Dänzer wrote:
>>>>> On 17/05/17 01:28 PM, zhoucm1 wrote:
>>>>>> On 2017年05月17日 11:15, Michel Dänzer wrote:
>>>>>>> On 17/05/17 12:04 PM, zhoucm1 wrote:
>>>>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote:
>>>>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>>>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>>>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> index bca1fb5..f3e7525 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>>>>>>>>>> void *data, struct drm_file *filp)
>>>>>>>>>>          case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>>>>>>>>              amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm,
>>>>>>>>>> AMDGPU_GFXHUB);
>>>>>>>>>>              break;
>>>>>>>>>> +    case AMDGPU_VM_OP_RESET:
>>>>>>>>>> +        fpriv->vram_lost_counter =
>>>>>>>>>> atomic_read(&adev->vram_lost_counter);
>>>>>>>>>> +        break;
>>>>>>>>> How do you envision the UMDs using this? I can mostly think of
>>>>>>>>> them
>>>>>>>>> calling this ioctl when a context is created or destroyed. But
>>>>>>>>> that
>>>>>>>>> would also allow any other remaining contexts using the same
>>>>>>>>> DRM file
>>>>>>>>> descriptor to use all ioctls again. So, I think there needs to
>>>>>>>>> be a
>>>>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct
>>>>>>>>> amdgpu_fpriv.
>>>>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place,
>>>>>>>> especially
>>>>>>>> for ioctl return value.
>>>>>>>> if you need to reset ctx one by one, we can mark all contexts of
>>>>>>>> that
>>>>>>>> vm, and then reset by userspace.
>>>>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
>>>>>>> context calls this ioctl, all other contexts using the same file
>>>>>>> descriptor will also be considered safe again, right?
>>>>>> Yes, but it really depends on userspace requirement, if you need to
>>>>>> reset ctx one by one, we can mark all contexts of that vm to
>>>>>> guilty, and
>>>>>> then reset one context by userspace.
>>>>> Still not sure what you mean by that.
>>>>>
>>>>> E.g. what do you mean by "guilty"? I thought that refers to the
>>>>> context
>>>>> which caused a hang. But it seems like you're using it to refer to any
>>>>> context which hasn't reacted yet to VRAM contents being lost.
>>>> When vram is lost, we treat all contexts need to reset.
>>> Essentially, your patches only track VRAM contents being lost per file
>>> descriptor, not per context. I'm not sure (rather skeptical) that this
>>> is suitable for OpenGL UMDs, since state is usually tracked per context.
>>> Marek / Nicolai?
>>
>> Oh, yeah that's a good point.
>>
>> The problem with tracking it per context is that Vulkan also wants the
>> ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are
>> context less.
>>
>> But thinking more about this blocking those two doesn't make much
>> sense. The VM content can be restored and why should be disallow
>> reading GPU info?
> I can re-paste the Vulkan APIs requiring ENODEV:
> "
> 
> The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST according
> to the spec.
> 
> I tries to provide a list of u/k interfaces that could be called for
> each vk API.
> 
>  
> 
> vkCreateDevice
> 
> -          amdgpu_device_initialize.
> -          amdgpu_query_gpu_info

[...]

> vkQueueBindSparse**
> 
>                 amdgpu_bo_va_op
>                 amdgpu_bo_va_op_raw

So these *can* return VK_ERROR_DEVICE_LOST, but do they *have to* do
that after a reset? :)


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                                         ` <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org>
  2017-05-17  8:55                                           ` Michel Dänzer
@ 2017-05-17  8:56                                           ` Christian König
       [not found]                                             ` <46582a1e-e019-34ac-1913-ed4a2a992e4c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  1 sibling, 1 reply; 22+ messages in thread
From: Christian König @ 2017-05-17  8:56 UTC (permalink / raw)
  To: zhoucm1, Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 4975 bytes --]

Am 17.05.2017 um 10:46 schrieb zhoucm1:
>
>
> On 2017年05月17日 16:40, Christian König wrote:
>> Am 17.05.2017 um 10:01 schrieb Michel Dänzer:
>>> On 17/05/17 04:13 PM, zhoucm1 wrote:
>>>> On 2017年05月17日 14:57, Michel Dänzer wrote:
>>>>> On 17/05/17 01:28 PM, zhoucm1 wrote:
>>>>>> On 2017年05月17日 11:15, Michel Dänzer wrote:
>>>>>>> On 17/05/17 12:04 PM, zhoucm1 wrote:
>>>>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote:
>>>>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>>>>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
>>>>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou-5C7GfCeVMHo@public.gmane.org>
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> index bca1fb5..f3e7525 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device 
>>>>>>>>>> *dev,
>>>>>>>>>> void *data, struct drm_file *filp)
>>>>>>>>>>          case AMDGPU_VM_OP_UNRESERVE_VMID:
>>>>>>>>>>              amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm,
>>>>>>>>>> AMDGPU_GFXHUB);
>>>>>>>>>>              break;
>>>>>>>>>> +    case AMDGPU_VM_OP_RESET:
>>>>>>>>>> +        fpriv->vram_lost_counter =
>>>>>>>>>> atomic_read(&adev->vram_lost_counter);
>>>>>>>>>> +        break;
>>>>>>>>> How do you envision the UMDs using this? I can mostly think of 
>>>>>>>>> them
>>>>>>>>> calling this ioctl when a context is created or destroyed. But 
>>>>>>>>> that
>>>>>>>>> would also allow any other remaining contexts using the same 
>>>>>>>>> DRM file
>>>>>>>>> descriptor to use all ioctls again. So, I think there needs to 
>>>>>>>>> be a
>>>>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct
>>>>>>>>> amdgpu_fpriv.
>>>>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, 
>>>>>>>> especially
>>>>>>>> for ioctl return value.
>>>>>>>> if you need to reset ctx one by one, we can mark all contexts 
>>>>>>>> of that
>>>>>>>> vm, and then reset by userspace.
>>>>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
>>>>>>> context calls this ioctl, all other contexts using the same file
>>>>>>> descriptor will also be considered safe again, right?
>>>>>> Yes, but it really depends on userspace requirement, if you need to
>>>>>> reset ctx one by one, we can mark all contexts of that vm to 
>>>>>> guilty, and
>>>>>> then reset one context by userspace.
>>>>> Still not sure what you mean by that.
>>>>>
>>>>> E.g. what do you mean by "guilty"? I thought that refers to the 
>>>>> context
>>>>> which caused a hang. But it seems like you're using it to refer to 
>>>>> any
>>>>> context which hasn't reacted yet to VRAM contents being lost.
>>>> When vram is lost, we treat all contexts need to reset.
>>> Essentially, your patches only track VRAM contents being lost per file
>>> descriptor, not per context. I'm not sure (rather skeptical) that this
>>> is suitable for OpenGL UMDs, since state is usually tracked per 
>>> context.
>>> Marek / Nicolai?
>>
>> Oh, yeah that's a good point.
>>
>> The problem with tracking it per context is that Vulkan also wants 
>> the ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which 
>> are context less.
>>
>> But thinking more about this blocking those two doesn't make much 
>> sense. The VM content can be restored and why should be disallow 
>> reading GPU info?
> I can re-paste the Vulkan APIs requiring ENODEV:
> "
>
> The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST 
> according to the spec.
>
> I tries to provide a list of u/k interfaces that could be called for 
> each vk API.
>

Well those are the Vulkan requirements, but that doesn't necessarily mean 
we must follow that on the kernel side. Keep in mind that Vulkan can't 
make any requirements towards the kernel driver.

IIRC we already have a query Vulkan can use to figure out if a GPU reset 
happened or not. So they could use that instead.

Regards,
Christian.

> vkCreateDevice
>
> -amdgpu_device_initialize.
>
> -amdgpu_query_gpu_info
>
> vkQueueSubmit
>
> -amdgpu_cs_submit
>
> vkWaitForFences
>
>                 amdgpu_cs_wait_fences
>
> vkGetEventStatus
>
> vkQueueWaitIdle
>
> vkDeviceWaitIdle
>
> vkGetQueryPoolResults**
>
>                 amdgpu_cs_query_Fence_status
>
> vkQueueBindSparse**
>
>                 amdgpu_bo_va_op
>
>                 amdgpu_bo_va_op_raw
>
> vkCreateSwapchainKHR**
>
> vkAcquireNextImageKHR**
>
> vkQueuePresentKHR
>
>                 Not related with u/k interface.**
>
> **
>
> Besides those listed above, I think 
> amdgpu_cs_signal_Sem/amdgpu_cs_wait_sem should respond to gpu reset as 
> well."
>>
>> Christian.
>>
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 10254 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                                             ` <46582a1e-e019-34ac-1913-ed4a2a992e4c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-17  9:49                                               ` Marek Olšák
       [not found]                                                 ` <CAAxE2A7sRZXx3MnRSO76DW=61X06nfVs2AHe_a-r+K+46tfJPw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Marek Olšák @ 2017-05-17  9:49 UTC (permalink / raw)
  To: Christian König; +Cc: zhoucm1, Michel Dänzer, amd-gfx mailing list

David,

We already have a query that returns whether a device is lost. It's
called amdgpu_cs_query_reset_state. That should return whether a hang
was caused by a certain context or whether the hang happened but the
context is innocent. You can extend it to accept no context, in which
case it will return either NO_RESET (everything is OK) or
UNKNOWN_RESET (= when a hang happened but the caller didn't specify
the context).

Marek

On Wed, May 17, 2017 at 10:56 AM, Christian König
<deathsimple@vodafone.de> wrote:
> Am 17.05.2017 um 10:46 schrieb zhoucm1:
>
>
>
> On 2017年05月17日 16:40, Christian König wrote:
>
> Am 17.05.2017 um 10:01 schrieb Michel Dänzer:
>
> On 17/05/17 04:13 PM, zhoucm1 wrote:
>
> On 2017年05月17日 14:57, Michel Dänzer wrote:
>
> On 17/05/17 01:28 PM, zhoucm1 wrote:
>
> On 2017年05月17日 11:15, Michel Dänzer wrote:
>
> On 17/05/17 12:04 PM, zhoucm1 wrote:
>
> On 2017年05月17日 09:18, Michel Dänzer wrote:
>
> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>
> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index bca1fb5..f3e7525 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
> void *data, struct drm_file *filp)
>          case AMDGPU_VM_OP_UNRESERVE_VMID:
>              amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm,
> AMDGPU_GFXHUB);
>              break;
> +    case AMDGPU_VM_OP_RESET:
> +        fpriv->vram_lost_counter =
> atomic_read(&adev->vram_lost_counter);
> +        break;
>
> How do you envision the UMDs using this? I can mostly think of them
> calling this ioctl when a context is created or destroyed. But that
> would also allow any other remaining contexts using the same DRM file
> descriptor to use all ioctls again. So, I think there needs to be a
> vram_lost_counter in struct amdgpu_ctx instead of in struct
> amdgpu_fpriv.
>
> struct amdgpu_fpriv for vram_lost_counter is proper place, especially
> for ioctl return value.
> if you need to reset ctx one by one, we can mark all contexts of that
> vm, and then reset by userspace.
>
> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any
> context calls this ioctl, all other contexts using the same file
> descriptor will also be considered safe again, right?
>
> Yes, but it really depends on userspace requirement, if you need to
> reset ctx one by one, we can mark all contexts of that vm to guilty, and
> then reset one context by userspace.
>
> Still not sure what you mean by that.
>
> E.g. what do you mean by "guilty"? I thought that refers to the context
> which caused a hang. But it seems like you're using it to refer to any
> context which hasn't reacted yet to VRAM contents being lost.
>
> When vram is lost, we treat all contexts need to reset.
>
> Essentially, your patches only track VRAM contents being lost per file
> descriptor, not per context. I'm not sure (rather skeptical) that this
> is suitable for OpenGL UMDs, since state is usually tracked per context.
> Marek / Nicolai?
>
>
> Oh, yeah that's a good point.
>
> The problem with tracking it per context is that Vulkan also wants the
> ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are context
> less.
>
> But thinking more about this blocking those two doesn't make much sense. The
> VM content can be restored and why should be disallow reading GPU info?
>
> I can re-paste the Vulkan APIs requiring ENODEV:
> "
>
> The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST according to
> the spec.
>
> I tries to provide a list of u/k interfaces that could be called for each vk
> API.
>
>
> Well those are the Vulkan requirements, but that doesn't necessary mean we
> must follow that on the kernel side. Keep in mind that Vulkan can't made any
> requirements towards the kernel driver.
>
> IIRC we already have a query Vulkan can use to figure out if a GPU reset
> happened or not. So they could use that instead.
>
> Regards,
> Christian.
>
>
>
> vkCreateDevice
>
> -          amdgpu_device_initialize.
>
> -          amdgpu_query_gpu_info
>
>
>
> vkQueueSubmit
>
> -          amdgpu_cs_submit
>
>
>
> vkWaitForFences
>
>                 amdgpu_cs_wait_fences
>
>
>
> vkGetEventStatus
>
> vkQueueWaitIdle
>
> vkDeviceWaitIdle
>
> vkGetQueryPoolResults
>
>                 amdgpu_cs_query_Fence_status
>
>
>
> vkQueueBindSparse
>
>                 amdgpu_bo_va_op
>
>                 amdgpu_bo_va_op_raw
>
>
>
> vkCreateSwapchainKHR
>
> vkAcquireNextImageKHR
>
> vkQueuePresentKHR
>
>                 Not related with u/k interface.
>
>
>
> Besides those listed above, I think amdgpu_cs_signal_Sem/amdgpu_cs_wait_sem
> should respond to gpu reset as well."
>
>
> Christian.
>
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter
       [not found]                                                 ` <CAAxE2A7sRZXx3MnRSO76DW=61X06nfVs2AHe_a-r+K+46tfJPw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-17 10:15                                                   ` Zhou, David(ChunMing)
  0 siblings, 0 replies; 22+ messages in thread
From: Zhou, David(ChunMing) @ 2017-05-17 10:15 UTC (permalink / raw)
  To: Marek Olšák, Christian König
  Cc: Michel Dänzer, amd-gfx mailing list

Yes, agree, it's easy to extend it as you like.
For an innocent context, that would require us to find out the context of the guilty job, which is a TODO.

Regards,
David Zhou

-----Original Message-----
From: Marek Olšák [mailto:maraeo@gmail.com] 
Sent: Wednesday, May 17, 2017 5:50 PM
To: Christian König <deathsimple@vodafone.de>
Cc: Zhou, David(ChunMing) <David1.Zhou@amd.com>; Michel Dänzer <michel@daenzer.net>; amd-gfx mailing list <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter

David,

We already have a query that returns whether a device is lost. It's called amdgpu_cs_query_reset_state. That should return whether a hang was caused by a certain context or whether the hang happened but the context is innocent. You can extend it to accept no context, in which case it will return either NO_RESET (everything is OK) or UNKNOWN_RESET (= when a hang happened but the caller didn't specify the context).

Marek

On Wed, May 17, 2017 at 10:56 AM, Christian König <deathsimple@vodafone.de> wrote:
> Am 17.05.2017 um 10:46 schrieb zhoucm1:
>
>
>
> On 2017年05月17日 16:40, Christian König wrote:
>
> Am 17.05.2017 um 10:01 schrieb Michel Dänzer:
>
> On 17/05/17 04:13 PM, zhoucm1 wrote:
>
> On 2017年05月17日 14:57, Michel Dänzer wrote:
>
> On 17/05/17 01:28 PM, zhoucm1 wrote:
>
> On 2017年05月17日 11:15, Michel Dänzer wrote:
>
> On 17/05/17 12:04 PM, zhoucm1 wrote:
>
> On 2017年05月17日 09:18, Michel Dänzer wrote:
>
> On 16/05/17 06:25 PM, Chunming Zhou wrote:
>
> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977
> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index bca1fb5..f3e7525 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *filp)
>          case AMDGPU_VM_OP_UNRESERVE_VMID:
>              amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, 
> AMDGPU_GFXHUB);
>              break;
> +    case AMDGPU_VM_OP_RESET:
> +        fpriv->vram_lost_counter =
> atomic_read(&adev->vram_lost_counter);
> +        break;
>
> How do you envision the UMDs using this? I can mostly think of them 
> calling this ioctl when a context is created or destroyed. But that 
> would also allow any other remaining contexts using the same DRM file 
> descriptor to use all ioctls again. So, I think there needs to be a 
> vram_lost_counter in struct amdgpu_ctx instead of in struct 
> amdgpu_fpriv.
>
> struct amdgpu_fpriv for vram_lost_counter is proper place, especially 
> for ioctl return value.
> if you need to reset ctx one by one, we can mark all contexts of that 
> vm, and then reset by userspace.
>
> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any 
> context calls this ioctl, all other contexts using the same file 
> descriptor will also be considered safe again, right?
>
> Yes, but it really depends on userspace requirement, if you need to 
> reset ctx one by one, we can mark all contexts of that vm to guilty, 
> and then reset one context by userspace.
>
> Still not sure what you mean by that.
>
> E.g. what do you mean by "guilty"? I thought that refers to the 
> context which caused a hang. But it seems like you're using it to 
> refer to any context which hasn't reacted yet to VRAM contents being lost.
>
> When vram is lost, we treat all contexts need to reset.
>
> Essentially, your patches only track VRAM contents being lost per file 
> descriptor, not per context. I'm not sure (rather skeptical) that this 
> is suitable for OpenGL UMDs, since state is usually tracked per context.
> Marek / Nicolai?
>
>
> Oh, yeah that's a good point.
>
> The problem with tracking it per context is that Vulkan also wants the 
> ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are 
> context less.
>
> But thinking more about this blocking those two doesn't make much 
> sense. The VM content can be restored and why should be disallow reading GPU info?
>
> I can re-paste the Vulkan APIs requiring ENODEV:
> "
>
> The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST 
> according to the spec.
>
> I tries to provide a list of u/k interfaces that could be called for 
> each vk API.
>
>
> Well those are the Vulkan requirements, but that doesn't necessary 
> mean we must follow that on the kernel side. Keep in mind that Vulkan 
> can't made any requirements towards the kernel driver.
>
> IIRC we already have a query Vulkan can use to figure out if a GPU 
> reset happened or not. So they could use that instead.
>
> Regards,
> Christian.
>
>
>
> vkCreateDevice
>
> -          amdgpu_device_initialize.
>
> -          amdgpu_query_gpu_info
>
>
>
> vkQueueSubmit
>
> -          amdgpu_cs_submit
>
>
>
> vkWaitForFences
>
>                 amdgpu_cs_wait_fences
>
>
>
> vkGetEventStatus
>
> vkQueueWaitIdle
>
> vkDeviceWaitIdle
>
> vkGetQueryPoolResults
>
>                 amdgpu_cs_query_Fence_status
>
>
>
> vkQueueBindSparse
>
>                 amdgpu_bo_va_op
>
>                 amdgpu_bo_va_op_raw
>
>
>
> vkCreateSwapchainKHR
>
> vkAcquireNextImageKHR
>
> vkQueuePresentKHR
>
>                 Not related with u/k interface.
>
>
>
> Besides those listed above, I think 
> amdgpu_cs_signal_Sem/amdgpu_cs_wait_sem
> should respond to gpu reset as well."
>
>
> Christian.
>
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when vram is lost v2
       [not found]     ` <1494926750-1081-2-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-23 15:08       ` Deucher, Alexander
       [not found]         ` <BN6PR12MB1652E1AFF691F58C92FC2CD7F7F90-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Deucher, Alexander @ 2017-05-23 15:08 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Zhou, David(ChunMing)

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Chunming Zhou
> Sent: Tuesday, May 16, 2017 5:26 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhou, David(ChunMing)
> Subject: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when
> vram is lost v2
> 
> below ioctl will return -ENODEV:
> amdgpu_cs_ioctl
> amdgpu_cs_wait_ioctl
> amdgpu_cs_wait_fences_ioctl
> amdgpu_gem_va_ioctl
> amdgpu_info_ioctl

Do we want to block the info ioctl?  Isn't that where the lost context query is?

Alex

> 
> v2: only for map and replace cases in amdgpu_gem_va_ioctl
> 
> Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e
> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  9 +++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  5 +++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 10 ++++++++++
>  5 files changed, 31 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f9da215..dcd6203 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -855,6 +855,7 @@ struct amdgpu_fpriv {
>  	struct amdgpu_ctx_mgr	ctx_mgr;
>  	spinlock_t		sem_handles_lock;
>  	struct idr		sem_handles;
> +	u32			vram_lost_counter;
>  };
> 
>  /*
> @@ -1607,6 +1608,7 @@ struct amdgpu_device {
>  	atomic64_t			num_bytes_moved;
>  	atomic64_t			num_evictions;
>  	atomic_t			gpu_reset_counter;
> +	atomic_t			vram_lost_counter;
> 
>  	/* data for buffer migration throttling */
>  	struct {
> @@ -2005,6 +2007,8 @@ static inline void
> amdgpu_unregister_atpx_handler(void) {}
>  extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
>  extern const int amdgpu_max_kms_ioctl;
> 
> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
> +			  struct amdgpu_fpriv *fpriv);
>  int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags);
>  int amdgpu_driver_unload_kms(struct drm_device *dev);
>  void amdgpu_driver_lastclose_kms(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index b803412..911aa02 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct
> amdgpu_cs_parser *p,
>  int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file
> *filp)
>  {
>  	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>  	union drm_amdgpu_cs *cs = data;
>  	struct amdgpu_cs_parser parser = {};
>  	bool reserved_buffers = false;
> @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void
> *data, struct drm_file *filp)
> 
>  	if (!adev->accel_working)
>  		return -EBUSY;
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;
> 
>  	parser.adev = adev;
>  	parser.filp = filp;
> @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device
> *dev, void *data,
>  {
>  	union drm_amdgpu_wait_cs *wait = data;
>  	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>  	unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
>  	struct amdgpu_ring *ring = NULL;
>  	struct amdgpu_ctx *ctx;
>  	struct fence *fence;
>  	long r;
> 
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;
>  	r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait-
> >in.ip_instance,
>  			       wait->in.ring, &ring);
>  	if (r)
> @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct
> drm_device *dev, void *data,
>  				struct drm_file *filp)
>  {
>  	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>  	union drm_amdgpu_wait_fences *wait = data;
>  	uint32_t fence_count = wait->in.fence_count;
>  	struct drm_amdgpu_fence *fences_user;
>  	struct drm_amdgpu_fence *fences;
>  	int r;
> 
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;
>  	/* Get the fences from userspace */
>  	fences = kmalloc_array(fence_count, sizeof(struct
> drm_amdgpu_fence),
>  			GFP_KERNEL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index c56ae4a..2f0fcf8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2913,8 +2913,10 @@ int amdgpu_gpu_reset(struct amdgpu_device
> *adev)
>  			if (r)
>  				goto out;
>  			vram_lost = amdgpu_check_vram_lost(adev);
> -			if (vram_lost)
> +			if (vram_lost) {
>  				DRM_ERROR("VRAM is lost!\n");
> +				atomic_inc(&adev->vram_lost_counter);
> +			}
>  			r = amdgpu_ttm_recover_gart(adev);
>  			if (r)
>  				goto out;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index d8275ef..83bc94c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -802,6 +802,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev,
> void *data,
>  			args->operation);
>  		return -EINVAL;
>  	}
> +	if ((args->operation == AMDGPU_VA_OP_MAP) ||
> +	    (args->operation == AMDGPU_VA_OP_REPLACE)) {
> +		if (amdgpu_kms_vram_lost(adev, fpriv))
> +			return -ENODEV;
> +	}
> 
>  	INIT_LIST_HEAD(&list);
>  	if ((args->operation != AMDGPU_VA_OP_CLEAR) &&
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 368829a..a231aa1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct
> drm_amdgpu_info_firmware *fw_info,
>  static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct
> drm_file *filp)
>  {
>  	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>  	struct drm_amdgpu_info *info = data;
>  	struct amdgpu_mode_info *minfo = &adev->mode_info;
>  	void __user *out = (void __user *)(uintptr_t)info->return_pointer;
> @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev,
> void *data, struct drm_file
> 
>  	if (!info->return_size || !info->return_pointer)
>  		return -EINVAL;
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;
> 
>  	switch (info->query) {
>  	case AMDGPU_INFO_VIRTUAL_RANGE: {
> @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct
> drm_device *dev)
>  	vga_switcheroo_process_delayed_switch();
>  }
> 
> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
> +			  struct amdgpu_fpriv *fpriv)
> +{
> +	return fpriv->vram_lost_counter != atomic_read(&adev-
> >vram_lost_counter);
> +}
> +
>  /**
>   * amdgpu_driver_open_kms - drm callback for open
>   *
> @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device
> *dev, struct drm_file *file_priv)
> 
>  	amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
> 
> +	fpriv->vram_lost_counter = atomic_read(&adev-
> >vram_lost_counter);
>  	file_priv->driver_priv = fpriv;
> 
>  out_suspend:
> --
> 1.9.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when vram is lost v2
       [not found]         ` <BN6PR12MB1652E1AFF691F58C92FC2CD7F7F90-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-05-23 15:16           ` Christian König
       [not found]             ` <69717c0b-b2c1-589a-c466-5d6be9518eda-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 22+ messages in thread
From: Christian König @ 2017-05-23 15:16 UTC (permalink / raw)
  To: Deucher, Alexander, Zhou, David(ChunMing),
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 23.05.2017 um 17:08 schrieb Deucher, Alexander:
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>> Of Chunming Zhou
>> Sent: Tuesday, May 16, 2017 5:26 AM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Zhou, David(ChunMing)
>> Subject: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when
>> vram is lost v2
>>
>> below ioctl will return -ENODEV:
>> amdgpu_cs_ioctl
>> amdgpu_cs_wait_ioctl
>> amdgpu_cs_wait_fences_ioctl
>> amdgpu_gem_va_ioctl
>> amdgpu_info_ioctl
> Do we want to block the info ioctl?  Isn't that where the lost context query is?

No, that's amdgpu_ctx_ioctl.

But I think the conclusion is that we want to move the vram_lost counter 
to be per CTX and not per device.

Christian.

>
> Alex
>
>> v2: only for map and replace cases in amdgpu_gem_va_ioctl
>>
>> Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e
>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  9 +++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  5 +++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 10 ++++++++++
>>   5 files changed, 31 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index f9da215..dcd6203 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -855,6 +855,7 @@ struct amdgpu_fpriv {
>>   	struct amdgpu_ctx_mgr	ctx_mgr;
>>   	spinlock_t		sem_handles_lock;
>>   	struct idr		sem_handles;
>> +	u32			vram_lost_counter;
>>   };
>>
>>   /*
>> @@ -1607,6 +1608,7 @@ struct amdgpu_device {
>>   	atomic64_t			num_bytes_moved;
>>   	atomic64_t			num_evictions;
>>   	atomic_t			gpu_reset_counter;
>> +	atomic_t			vram_lost_counter;
>>
>>   	/* data for buffer migration throttling */
>>   	struct {
>> @@ -2005,6 +2007,8 @@ static inline void
>> amdgpu_unregister_atpx_handler(void) {}
>>   extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
>>   extern const int amdgpu_max_kms_ioctl;
>>
>> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
>> +			  struct amdgpu_fpriv *fpriv);
>>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags);
>>   int amdgpu_driver_unload_kms(struct drm_device *dev);
>>   void amdgpu_driver_lastclose_kms(struct drm_device *dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index b803412..911aa02 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct
>> amdgpu_cs_parser *p,
>>   int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file
>> *filp)
>>   {
>>   	struct amdgpu_device *adev = dev->dev_private;
>> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>   	union drm_amdgpu_cs *cs = data;
>>   	struct amdgpu_cs_parser parser = {};
>>   	bool reserved_buffers = false;
>> @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void
>> *data, struct drm_file *filp)
>>
>>   	if (!adev->accel_working)
>>   		return -EBUSY;
>> +	if (amdgpu_kms_vram_lost(adev, fpriv))
>> +		return -ENODEV;
>>
>>   	parser.adev = adev;
>>   	parser.filp = filp;
>> @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device
>> *dev, void *data,
>>   {
>>   	union drm_amdgpu_wait_cs *wait = data;
>>   	struct amdgpu_device *adev = dev->dev_private;
>> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>   	unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
>>   	struct amdgpu_ring *ring = NULL;
>>   	struct amdgpu_ctx *ctx;
>>   	struct fence *fence;
>>   	long r;
>>
>> +	if (amdgpu_kms_vram_lost(adev, fpriv))
>> +		return -ENODEV;
>>   	r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait-
>>> in.ip_instance,
>>   			       wait->in.ring, &ring);
>>   	if (r)
>> @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct
>> drm_device *dev, void *data,
>>   				struct drm_file *filp)
>>   {
>>   	struct amdgpu_device *adev = dev->dev_private;
>> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>   	union drm_amdgpu_wait_fences *wait = data;
>>   	uint32_t fence_count = wait->in.fence_count;
>>   	struct drm_amdgpu_fence *fences_user;
>>   	struct drm_amdgpu_fence *fences;
>>   	int r;
>>
>> +	if (amdgpu_kms_vram_lost(adev, fpriv))
>> +		return -ENODEV;
>>   	/* Get the fences from userspace */
>>   	fences = kmalloc_array(fence_count, sizeof(struct
>> drm_amdgpu_fence),
>>   			GFP_KERNEL);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index c56ae4a..2f0fcf8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2913,8 +2913,10 @@ int amdgpu_gpu_reset(struct amdgpu_device
>> *adev)
>>   			if (r)
>>   				goto out;
>>   			vram_lost = amdgpu_check_vram_lost(adev);
>> -			if (vram_lost)
>> +			if (vram_lost) {
>>   				DRM_ERROR("VRAM is lost!\n");
>> +				atomic_inc(&adev->vram_lost_counter);
>> +			}
>>   			r = amdgpu_ttm_recover_gart(adev);
>>   			if (r)
>>   				goto out;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index d8275ef..83bc94c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -802,6 +802,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev,
>> void *data,
>>   			args->operation);
>>   		return -EINVAL;
>>   	}
>> +	if ((args->operation == AMDGPU_VA_OP_MAP) ||
>> +	    (args->operation == AMDGPU_VA_OP_REPLACE)) {
>> +		if (amdgpu_kms_vram_lost(adev, fpriv))
>> +			return -ENODEV;
>> +	}
>>
>>   	INIT_LIST_HEAD(&list);
>>   	if ((args->operation != AMDGPU_VA_OP_CLEAR) &&
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 368829a..a231aa1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct
>> drm_amdgpu_info_firmware *fw_info,
>>   static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct
>> drm_file *filp)
>>   {
>>   	struct amdgpu_device *adev = dev->dev_private;
>> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>   	struct drm_amdgpu_info *info = data;
>>   	struct amdgpu_mode_info *minfo = &adev->mode_info;
>>   	void __user *out = (void __user *)(uintptr_t)info->return_pointer;
>> @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev,
>> void *data, struct drm_file
>>
>>   	if (!info->return_size || !info->return_pointer)
>>   		return -EINVAL;
>> +	if (amdgpu_kms_vram_lost(adev, fpriv))
>> +		return -ENODEV;
>>
>>   	switch (info->query) {
>>   	case AMDGPU_INFO_VIRTUAL_RANGE: {
>> @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct
>> drm_device *dev)
>>   	vga_switcheroo_process_delayed_switch();
>>   }
>>
>> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
>> +			  struct amdgpu_fpriv *fpriv)
>> +{
>> +	return fpriv->vram_lost_counter != atomic_read(&adev-
>>> vram_lost_counter);
>> +}
>> +
>>   /**
>>    * amdgpu_driver_open_kms - drm callback for open
>>    *
>> @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device
>> *dev, struct drm_file *file_priv)
>>
>>   	amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
>>
>> +	fpriv->vram_lost_counter = atomic_read(&adev-
>>> vram_lost_counter);
>>   	file_priv->driver_priv = fpriv;
>>
>>   out_suspend:
>> --
>> 1.9.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when vram is lost v2
       [not found]             ` <69717c0b-b2c1-589a-c466-5d6be9518eda-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-24  2:20               ` zhoucm1
  0 siblings, 0 replies; 22+ messages in thread
From: zhoucm1 @ 2017-05-24  2:20 UTC (permalink / raw)
  To: Christian König, Deucher, Alexander,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2017年05月23日 23:16, Christian König wrote:
> Am 23.05.2017 um 17:08 schrieb Deucher, Alexander:
>>> -----Original Message-----
>>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>>> Of Chunming Zhou
>>> Sent: Tuesday, May 16, 2017 5:26 AM
>>> To: amd-gfx@lists.freedesktop.org
>>> Cc: Zhou, David(ChunMing)
>>> Subject: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when
>>> vram is lost v2
>>>
>>> below ioctl will return -ENODEV:
>>> amdgpu_cs_ioctl
>>> amdgpu_cs_wait_ioctl
>>> amdgpu_cs_wait_fences_ioctl
>>> amdgpu_gem_va_ioctl
>>> amdgpu_info_ioctl
>> Do we want to block the info ioctl?  Isn't that where the lost 
>> context query is?
>
> No, that's amdgpu_ctx_ioctl.
>
> But I think the conclusion is that we want to move the vram_lost 
> counter to be per CTX and not per device.
Yes, Monk is working on it for virt case, after it, I think we can reuse it.

Regards,
David zhou
>
> Christian.
>
>>
>> Alex
>>
>>> v2: only for map and replace cases in amdgpu_gem_va_ioctl
>>>
>>> Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e
>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  9 +++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  5 +++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 10 ++++++++++
>>>   5 files changed, 31 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index f9da215..dcd6203 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -855,6 +855,7 @@ struct amdgpu_fpriv {
>>>       struct amdgpu_ctx_mgr    ctx_mgr;
>>>       spinlock_t        sem_handles_lock;
>>>       struct idr        sem_handles;
>>> +    u32            vram_lost_counter;
>>>   };
>>>
>>>   /*
>>> @@ -1607,6 +1608,7 @@ struct amdgpu_device {
>>>       atomic64_t            num_bytes_moved;
>>>       atomic64_t            num_evictions;
>>>       atomic_t            gpu_reset_counter;
>>> +    atomic_t            vram_lost_counter;
>>>
>>>       /* data for buffer migration throttling */
>>>       struct {
>>> @@ -2005,6 +2007,8 @@ static inline void
>>> amdgpu_unregister_atpx_handler(void) {}
>>>   extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
>>>   extern const int amdgpu_max_kms_ioctl;
>>>
>>> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
>>> +              struct amdgpu_fpriv *fpriv);
>>>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long 
>>> flags);
>>>   int amdgpu_driver_unload_kms(struct drm_device *dev);
>>>   void amdgpu_driver_lastclose_kms(struct drm_device *dev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index b803412..911aa02 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct
>>> amdgpu_cs_parser *p,
>>>   int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct 
>>> drm_file
>>> *filp)
>>>   {
>>>       struct amdgpu_device *adev = dev->dev_private;
>>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>>       union drm_amdgpu_cs *cs = data;
>>>       struct amdgpu_cs_parser parser = {};
>>>       bool reserved_buffers = false;
>>> @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void
>>> *data, struct drm_file *filp)
>>>
>>>       if (!adev->accel_working)
>>>           return -EBUSY;
>>> +    if (amdgpu_kms_vram_lost(adev, fpriv))
>>> +        return -ENODEV;
>>>
>>>       parser.adev = adev;
>>>       parser.filp = filp;
>>> @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device
>>> *dev, void *data,
>>>   {
>>>       union drm_amdgpu_wait_cs *wait = data;
>>>       struct amdgpu_device *adev = dev->dev_private;
>>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>>       unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
>>>       struct amdgpu_ring *ring = NULL;
>>>       struct amdgpu_ctx *ctx;
>>>       struct fence *fence;
>>>       long r;
>>>
>>> +    if (amdgpu_kms_vram_lost(adev, fpriv))
>>> +        return -ENODEV;
>>> +    r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait->in.ip_instance,
>>>                      wait->in.ring, &ring);
>>>       if (r)
>>> @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct
>>> drm_device *dev, void *data,
>>>                   struct drm_file *filp)
>>>   {
>>>       struct amdgpu_device *adev = dev->dev_private;
>>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>>       union drm_amdgpu_wait_fences *wait = data;
>>>       uint32_t fence_count = wait->in.fence_count;
>>>       struct drm_amdgpu_fence *fences_user;
>>>       struct drm_amdgpu_fence *fences;
>>>       int r;
>>>
>>> +    if (amdgpu_kms_vram_lost(adev, fpriv))
>>> +        return -ENODEV;
>>>       /* Get the fences from userspace */
>>>       fences = kmalloc_array(fence_count, sizeof(struct
>>> drm_amdgpu_fence),
>>>               GFP_KERNEL);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index c56ae4a..2f0fcf8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2913,8 +2913,10 @@ int amdgpu_gpu_reset(struct amdgpu_device
>>> *adev)
>>>               if (r)
>>>                   goto out;
>>>               vram_lost = amdgpu_check_vram_lost(adev);
>>> -            if (vram_lost)
>>> +            if (vram_lost) {
>>>                   DRM_ERROR("VRAM is lost!\n");
>>> +                atomic_inc(&adev->vram_lost_counter);
>>> +            }
>>>               r = amdgpu_ttm_recover_gart(adev);
>>>               if (r)
>>>                   goto out;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index d8275ef..83bc94c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -802,6 +802,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev,
>>> void *data,
>>>               args->operation);
>>>           return -EINVAL;
>>>       }
>>> +    if ((args->operation == AMDGPU_VA_OP_MAP) ||
>>> +        (args->operation == AMDGPU_VA_OP_REPLACE)) {
>>> +        if (amdgpu_kms_vram_lost(adev, fpriv))
>>> +            return -ENODEV;
>>> +    }
>>>
>>>       INIT_LIST_HEAD(&list);
>>>       if ((args->operation != AMDGPU_VA_OP_CLEAR) &&
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index 368829a..a231aa1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct
>>> drm_amdgpu_info_firmware *fw_info,
>>>   static int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
>>> struct
>>> drm_file *filp)
>>>   {
>>>       struct amdgpu_device *adev = dev->dev_private;
>>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>>       struct drm_amdgpu_info *info = data;
>>>       struct amdgpu_mode_info *minfo = &adev->mode_info;
>>>       void __user *out = (void __user 
>>> *)(uintptr_t)info->return_pointer;
>>> @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device 
>>> *dev,
>>> void *data, struct drm_file
>>>
>>>       if (!info->return_size || !info->return_pointer)
>>>           return -EINVAL;
>>> +    if (amdgpu_kms_vram_lost(adev, fpriv))
>>> +        return -ENODEV;
>>>
>>>       switch (info->query) {
>>>       case AMDGPU_INFO_VIRTUAL_RANGE: {
>>> @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct
>>> drm_device *dev)
>>>       vga_switcheroo_process_delayed_switch();
>>>   }
>>>
>>> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
>>> +              struct amdgpu_fpriv *fpriv)
>>> +{
>>> +    return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter);
>>> +}
>>> +
>>>   /**
>>>    * amdgpu_driver_open_kms - drm callback for open
>>>    *
>>> @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device
>>> *dev, struct drm_file *file_priv)
>>>
>>>       amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
>>>
>>> +    fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>>       file_priv->driver_priv = fpriv;
>>>
>>>   out_suspend:
>>> -- 
>>> 1.9.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2017-05-24  2:20 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-16  9:25 [PATCH 1/4] drm/amdgpu: check if vram is lost v2 Chunming Zhou
     [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
2017-05-16  9:25   ` [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when " Chunming Zhou
     [not found]     ` <1494926750-1081-2-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
2017-05-23 15:08       ` Deucher, Alexander
     [not found]         ` <BN6PR12MB1652E1AFF691F58C92FC2CD7F7F90-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-05-23 15:16           ` Christian König
     [not found]             ` <69717c0b-b2c1-589a-c466-5d6be9518eda-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-24  2:20               ` zhoucm1
2017-05-16  9:25   ` [PATCH 3/4] drm/amdgpu: skip all jobs of guilty vm Chunming Zhou
2017-05-16  9:25   ` [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter Chunming Zhou
     [not found]     ` <1494926750-1081-4-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
2017-05-17  1:18       ` Michel Dänzer
     [not found]         ` <58988726-543a-535a-3011-860d29b9f2da-otUistvHUpPR7s880joybQ@public.gmane.org>
2017-05-17  3:04           ` zhoucm1
     [not found]             ` <591BBDA2.1070900-5C7GfCeVMHo@public.gmane.org>
2017-05-17  3:15               ` Michel Dänzer
     [not found]                 ` <29fe2142-7fd1-e23a-49d9-c38dc685db92-otUistvHUpPR7s880joybQ@public.gmane.org>
2017-05-17  4:28                   ` zhoucm1
     [not found]                     ` <591BD17C.8050903-5C7GfCeVMHo@public.gmane.org>
2017-05-17  6:57                       ` Michel Dänzer
     [not found]                         ` <7d87bc8e-9c09-ad25-de6e-dfbd8116bf6e-otUistvHUpPR7s880joybQ@public.gmane.org>
2017-05-17  7:13                           ` zhoucm1
     [not found]                             ` <591BF825.6090505-5C7GfCeVMHo@public.gmane.org>
2017-05-17  8:01                               ` Michel Dänzer
     [not found]                                 ` <31db7a30-dd98-5cb2-4125-187d3d0e2a49-otUistvHUpPR7s880joybQ@public.gmane.org>
2017-05-17  8:40                                   ` Christian König
     [not found]                                     ` <7a302ebe-1de1-734f-fb21-aadcc7904d37-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-17  8:46                                       ` zhoucm1
     [not found]                                         ` <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org>
2017-05-17  8:55                                           ` Michel Dänzer
2017-05-17  8:56                                           ` Christian König
     [not found]                                             ` <46582a1e-e019-34ac-1913-ed4a2a992e4c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-17  9:49                                               ` Marek Olšák
     [not found]                                                 ` <CAAxE2A7sRZXx3MnRSO76DW=61X06nfVs2AHe_a-r+K+46tfJPw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-17 10:15                                                   ` Zhou, David(ChunMing)
2017-05-16 10:49   ` [PATCH 1/4] drm/amdgpu: check if vram is lost v2 Christian König
     [not found]     ` <0c1d89c5-65ea-cdad-100a-80d0377b865c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-17  4:37       ` zhoucm1

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.