All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work.
@ 2018-10-18 18:44 Andrey Grodzovsky
  2018-10-18 18:44 ` [PATCH 2/3] drm/sched: Expose drm_sched_entity_select_rq to use in drivers Andrey Grodzovsky
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Andrey Grodzovsky @ 2018-10-18 18:44 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Alexander.Deucher-5C7GfCeVMHo, Andrey Grodzovsky,
	christian.koenig-5C7GfCeVMHo

Problem:
A particular scheduler may become unusable (underlying HW) after
some event (e.g. GPU reset). If it's later chosen by
the get free sched. policy a command will fail to be
submitted.

Fix:
Add a driver specific callback to report the sched. status so
rq with bad sched. can be avoided in favor of working one or
none in which case job init will fail.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  9 ++++++++-
 drivers/gpu/drm/etnaviv/etnaviv_sched.c   |  3 ++-
 drivers/gpu/drm/scheduler/sched_entity.c  | 11 ++++++++++-
 drivers/gpu/drm/scheduler/sched_main.c    |  8 +++++++-
 drivers/gpu/drm/v3d/v3d_sched.c           |  3 ++-
 include/drm/gpu_scheduler.h               |  5 ++++-
 6 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5448cf2..bbfe7f501 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -404,6 +404,13 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
 	return 0;
 }
 
+static bool amdgpu_ring_ready(struct drm_gpu_scheduler *sched)
+{
+	struct amdgpu_ring *ring = to_amdgpu_ring(sched);
+
+	return ring->ready;
+}
+
 /**
  * amdgpu_fence_driver_init_ring - init the fence driver
  * for the requested ring.
@@ -450,7 +457,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 
 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
 				   num_hw_submission, amdgpu_job_hang_limit,
-				   timeout, ring->name);
+				   timeout, ring->name, amdgpu_ring_ready);
 		if (r) {
 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
 				  ring->name);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index f8c5f1e..5094013 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -178,7 +178,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
 
 	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
 			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
-			     msecs_to_jiffies(500), dev_name(gpu->dev));
+			     msecs_to_jiffies(500), dev_name(gpu->dev),
+				 NULL);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 3e22a54..320c77a 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -130,7 +130,16 @@ drm_sched_entity_get_free_sched(struct drm_sched_entity *entity)
 	int i;
 
 	for (i = 0; i < entity->num_rq_list; ++i) {
-		num_jobs = atomic_read(&entity->rq_list[i]->sched->num_jobs);
+		struct drm_gpu_scheduler *sched = entity->rq_list[i]->sched;
+
+		if (entity->rq_list[i]->sched->ready &&
+				!entity->rq_list[i]->sched->ready(sched)) {
+			DRM_WARN("sched%s is not ready, skipping", sched->name);
+
+			continue;
+		}
+
+		num_jobs = atomic_read(&sched->num_jobs);
 		if (num_jobs < min_jobs) {
 			min_jobs = num_jobs;
 			rq = entity->rq_list[i];
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 63b997d..6b151f2 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -420,6 +420,9 @@ int drm_sched_job_init(struct drm_sched_job *job,
 	struct drm_gpu_scheduler *sched;
 
 	drm_sched_entity_select_rq(entity);
+	if (!entity->rq)
+		return -ENOENT;
+
 	sched = entity->rq->sched;
 
 	job->sched = sched;
@@ -606,7 +609,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   unsigned hw_submission,
 		   unsigned hang_limit,
 		   long timeout,
-		   const char *name)
+		   const char *name,
+		   bool (*ready)(struct drm_gpu_scheduler *sched))
 {
 	int i;
 	sched->ops = ops;
@@ -633,6 +637,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 		return PTR_ERR(sched->thread);
 	}
 
+	sched->ready = ready;
+
 	return 0;
 }
 EXPORT_SYMBOL(drm_sched_init);
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 80b641f..e06cc0d 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -212,7 +212,8 @@ v3d_sched_init(struct v3d_dev *v3d)
 			     &v3d_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms),
-			     "v3d_bin");
+			     "v3d_bin",
+				 NULL);
 	if (ret) {
 		dev_err(v3d->dev, "Failed to create bin scheduler: %d.", ret);
 		return ret;
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 0684dcd..a6e18fe 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -283,12 +283,15 @@ struct drm_gpu_scheduler {
 	spinlock_t			job_list_lock;
 	int				hang_limit;
 	atomic_t                        num_jobs;
+
+	bool 			(*ready)(struct drm_gpu_scheduler *sched);
 };
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
 		   uint32_t hw_submission, unsigned hang_limit, long timeout,
-		   const char *name);
+		   const char *name,
+		   bool (*ready)(struct drm_gpu_scheduler *sched));
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
 		       struct drm_sched_entity *entity,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/3] drm/sched: Expose drm_sched_entity_select_rq to use in drivers.
  2018-10-18 18:44 [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work Andrey Grodzovsky
@ 2018-10-18 18:44 ` Andrey Grodzovsky
  2018-10-19  7:03 ` [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work Koenig, Christian
       [not found] ` <1539888261-331-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2 siblings, 0 replies; 10+ messages in thread
From: Andrey Grodzovsky @ 2018-10-18 18:44 UTC (permalink / raw)
  To: amd-gfx, dri-devel; +Cc: Alexander.Deucher, christian.koenig

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/scheduler/sched_entity.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 320c77a..426606c 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -492,6 +492,7 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
 	entity->rq = rq;
 	spin_unlock(&entity->rq_lock);
 }
+EXPORT_SYMBOL(drm_sched_entity_select_rq);
 
 /**
  * drm_sched_entity_push_job - Submit a job to the entity's job queue
-- 
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/3] drm/amdgpu: Refresh rq selection for job after ASIC reset
       [not found] ` <1539888261-331-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
@ 2018-10-18 18:44   ` Andrey Grodzovsky
       [not found]     ` <1539888261-331-3-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2018-10-19  7:13   ` [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work Michel Dänzer
  1 sibling, 1 reply; 10+ messages in thread
From: Andrey Grodzovsky @ 2018-10-18 18:44 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Alexander.Deucher-5C7GfCeVMHo, Andrey Grodzovsky,
	christian.koenig-5C7GfCeVMHo

A ring might become unusable after reset, if that is the case
drm_sched_entity_select_rq will choose another, working rq
to run the job if there is one.
Also, skip recovery of ring which is not ready.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d11489e..3124ca1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3355,10 +3355,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	else
 		r = amdgpu_device_reset(adev);
 
+	/*
+	 * After reboot a ring might fail in which case this will
+	 * move the job to different rq if possible
+	 */
+	if (job) {
+		drm_sched_entity_select_rq(job->base.entity);
+		if (job->base.entity->rq) {
+			job->base.sched = job->base.entity->rq->sched;
+		} else {
+			job->base.sched = NULL;
+			r = -ENOENT;
+		}
+	}
+
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !ring->ready || !ring->sched.thread)
 			continue;
 
 		/* only need recovery sched of the given job's ring
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work.
  2018-10-18 18:44 [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work Andrey Grodzovsky
  2018-10-18 18:44 ` [PATCH 2/3] drm/sched: Expose drm_sched_entity_select_rq to use in drivers Andrey Grodzovsky
@ 2018-10-19  7:03 ` Koenig, Christian
       [not found] ` <1539888261-331-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2 siblings, 0 replies; 10+ messages in thread
From: Koenig, Christian @ 2018-10-19  7:03 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx, dri-devel; +Cc: Deucher, Alexander

Am 18.10.18 um 20:44 schrieb Andrey Grodzovsky:
> Problem:
> A particular scheduler may become unusable (underlying HW) after
> some event (e.g. GPU reset). If it's later chosen by
> the get free sched. policy a command will fail to be
> submitted.
>
> Fix:
> Add a driver specific callback to report the sched. status so
> rq with bad sched. can be avoided in favor of working one or
> none in which case job init will fail.

I would rather go with the approach of moving the ready flag completely 
into the scheduler instead.

Christian.

>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  9 ++++++++-
>   drivers/gpu/drm/etnaviv/etnaviv_sched.c   |  3 ++-
>   drivers/gpu/drm/scheduler/sched_entity.c  | 11 ++++++++++-
>   drivers/gpu/drm/scheduler/sched_main.c    |  8 +++++++-
>   drivers/gpu/drm/v3d/v3d_sched.c           |  3 ++-
>   include/drm/gpu_scheduler.h               |  5 ++++-
>   6 files changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 5448cf2..bbfe7f501 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -404,6 +404,13 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>   	return 0;
>   }
>   
> +static bool amdgpu_ring_ready(struct drm_gpu_scheduler *sched)
> +{
> +	struct amdgpu_ring *ring = to_amdgpu_ring(sched);
> +
> +	return ring->ready;
> +}
> +
>   /**
>    * amdgpu_fence_driver_init_ring - init the fence driver
>    * for the requested ring.
> @@ -450,7 +457,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   
>   		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>   				   num_hw_submission, amdgpu_job_hang_limit,
> -				   timeout, ring->name);
> +				   timeout, ring->name, amdgpu_ring_ready);
>   		if (r) {
>   			DRM_ERROR("Failed to create scheduler on ring %s.\n",
>   				  ring->name);
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index f8c5f1e..5094013 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> @@ -178,7 +178,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
>   
>   	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
>   			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
> -			     msecs_to_jiffies(500), dev_name(gpu->dev));
> +			     msecs_to_jiffies(500), dev_name(gpu->dev),
> +				 NULL);
>   	if (ret)
>   		return ret;
>   
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 3e22a54..320c77a 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -130,7 +130,16 @@ drm_sched_entity_get_free_sched(struct drm_sched_entity *entity)
>   	int i;
>   
>   	for (i = 0; i < entity->num_rq_list; ++i) {
> -		num_jobs = atomic_read(&entity->rq_list[i]->sched->num_jobs);
> +		struct drm_gpu_scheduler *sched = entity->rq_list[i]->sched;
> +
> +		if (entity->rq_list[i]->sched->ready &&
> +				!entity->rq_list[i]->sched->ready(sched)) {
> +			DRM_WARN("sched%s is not ready, skipping", sched->name);
> +
> +			continue;
> +		}
> +
> +		num_jobs = atomic_read(&sched->num_jobs);
>   		if (num_jobs < min_jobs) {
>   			min_jobs = num_jobs;
>   			rq = entity->rq_list[i];
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 63b997d..6b151f2 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -420,6 +420,9 @@ int drm_sched_job_init(struct drm_sched_job *job,
>   	struct drm_gpu_scheduler *sched;
>   
>   	drm_sched_entity_select_rq(entity);
> +	if (!entity->rq)
> +		return -ENOENT;
> +
>   	sched = entity->rq->sched;
>   
>   	job->sched = sched;
> @@ -606,7 +609,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		   unsigned hw_submission,
>   		   unsigned hang_limit,
>   		   long timeout,
> -		   const char *name)
> +		   const char *name,
> +		   bool (*ready)(struct drm_gpu_scheduler *sched))
>   {
>   	int i;
>   	sched->ops = ops;
> @@ -633,6 +637,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		return PTR_ERR(sched->thread);
>   	}
>   
> +	sched->ready = ready;
> +
>   	return 0;
>   }
>   EXPORT_SYMBOL(drm_sched_init);
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 80b641f..e06cc0d 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -212,7 +212,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>   			     &v3d_sched_ops,
>   			     hw_jobs_limit, job_hang_limit,
>   			     msecs_to_jiffies(hang_limit_ms),
> -			     "v3d_bin");
> +			     "v3d_bin",
> +				 NULL);
>   	if (ret) {
>   		dev_err(v3d->dev, "Failed to create bin scheduler: %d.", ret);
>   		return ret;
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 0684dcd..a6e18fe 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -283,12 +283,15 @@ struct drm_gpu_scheduler {
>   	spinlock_t			job_list_lock;
>   	int				hang_limit;
>   	atomic_t                        num_jobs;
> +
> +	bool 			(*ready)(struct drm_gpu_scheduler *sched);
>   };
>   
>   int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		   const struct drm_sched_backend_ops *ops,
>   		   uint32_t hw_submission, unsigned hang_limit, long timeout,
> -		   const char *name);
> +		   const char *name,
> +		   bool (*ready)(struct drm_gpu_scheduler *sched));
>   void drm_sched_fini(struct drm_gpu_scheduler *sched);
>   int drm_sched_job_init(struct drm_sched_job *job,
>   		       struct drm_sched_entity *entity,

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: Refresh rq selection for job after ASIC reset
       [not found]     ` <1539888261-331-3-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
@ 2018-10-19  7:08       ` Koenig, Christian
       [not found]         ` <92efce5c-f9ad-847b-09b3-e7c6f7f13ff6-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Koenig, Christian @ 2018-10-19  7:08 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander

Am 18.10.18 um 20:44 schrieb Andrey Grodzovsky:
> A ring might become unusable after reset, if that is the case
> drm_sched_entity_select_rq will choose another, working rq
> to run the job if there is one.
> Also, skip recovery of ring which is not ready.

Well that is not even remotely sufficient.

If we can't bring up a ring any more after a reset we would need to move 
all jobs which were previously scheduled on it and all of its entities 
to a different instance.

What you do here breaks dependencies between jobs and can result in 
unforeseen consequences including random memory writes.

As far as I can see that can't be done correctly with the current 
scheduler design.

Additional to that when we can't restart one instance of a ring we 
usually can't restart all others either. So the whole approach is rather 
pointless.

Regards,
Christian.

>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index d11489e..3124ca1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3355,10 +3355,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	else
>   		r = amdgpu_device_reset(adev);
>   
> +	/*
> +	 * After reboot a ring might fail in which case this will
> +	 * move the job to different rq if possible
> +	 */
> +	if (job) {
> +		drm_sched_entity_select_rq(job->base.entity);
> +		if (job->base.entity->rq) {
> +			job->base.sched = job->base.entity->rq->sched;
> +		} else {
> +			job->base.sched = NULL;
> +			r = -ENOENT;
> +		}
> +	}
> +
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];
>   
> -		if (!ring || !ring->sched.thread)
> +		if (!ring || !ring->ready || !ring->sched.thread)
>   			continue;
>   
>   		/* only need recovery sched of the given job's ring

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work.
       [not found] ` <1539888261-331-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2018-10-18 18:44   ` [PATCH 3/3] drm/amdgpu: Refresh rq selection for job after ASIC reset Andrey Grodzovsky
@ 2018-10-19  7:13   ` Michel Dänzer
       [not found]     ` <567b553a-eef7-9560-a789-f1c8bffe3206-otUistvHUpPR7s880joybQ@public.gmane.org>
  1 sibling, 1 reply; 10+ messages in thread
From: Michel Dänzer @ 2018-10-19  7:13 UTC (permalink / raw)
  To: Andrey Grodzovsky
  Cc: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2018-10-18 8:44 p.m., Andrey Grodzovsky wrote:
> Problem:
> A particular scheduler may become unusable (underlying HW) after
> some event (e.g. GPU reset). If it's later chosen by
> the get free sched. policy a command will fail to be
> submitted.
> 
> Fix:
> Add a driver specific callback to report the sched. status so
> rq with bad sched. can be avoided in favor of working one or
> none in which case job init will fail.
> 
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> 
> [...]
>  
> @@ -450,7 +457,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>  
>  		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>  				   num_hw_submission, amdgpu_job_hang_limit,
> -				   timeout, ring->name);
> +				   timeout, ring->name, amdgpu_ring_ready);
>  		if (r) {
>  			DRM_ERROR("Failed to create scheduler on ring %s.\n",
>  				  ring->name);
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index f8c5f1e..5094013 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> @@ -178,7 +178,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
>  
>  	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
>  			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
> -			     msecs_to_jiffies(500), dev_name(gpu->dev));
> +			     msecs_to_jiffies(500), dev_name(gpu->dev),
> +				 NULL);

Indentation looks wrong for the NULL here...


> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 3e22a54..320c77a 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -130,7 +130,16 @@ drm_sched_entity_get_free_sched(struct drm_sched_entity *entity)
>  	int i;
>  
>  	for (i = 0; i < entity->num_rq_list; ++i) {
> -		num_jobs = atomic_read(&entity->rq_list[i]->sched->num_jobs);
> +		struct drm_gpu_scheduler *sched = entity->rq_list[i]->sched;
> +
> +		if (entity->rq_list[i]->sched->ready &&
> +				!entity->rq_list[i]->sched->ready(sched)) {

... the second line of the if here...


> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 80b641f..e06cc0d 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -212,7 +212,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>  			     &v3d_sched_ops,
>  			     hw_jobs_limit, job_hang_limit,
>  			     msecs_to_jiffies(hang_limit_ms),
> -			     "v3d_bin");
> +			     "v3d_bin",
> +				 NULL);

... the NULL here...


> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 0684dcd..a6e18fe 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -283,12 +283,15 @@ struct drm_gpu_scheduler {
>  	spinlock_t			job_list_lock;
>  	int				hang_limit;
>  	atomic_t                        num_jobs;
> +
> +	bool 			(*ready)(struct drm_gpu_scheduler *sched);
>  };

... and the new line here.

Which editor are you using?


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work.
       [not found]     ` <567b553a-eef7-9560-a789-f1c8bffe3206-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2018-10-19 14:45       ` Grodzovsky, Andrey
  0 siblings, 0 replies; 10+ messages in thread
From: Grodzovsky, Andrey @ 2018-10-19 14:45 UTC (permalink / raw)
  To: Michel Dänzer
  Cc: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Eclipse

Andrey


On 10/19/2018 03:13 AM, Michel Dänzer wrote:
> ... and the new line here.
>
> Which editor are you using?

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: Refresh rq selection for job after ASIC reset
       [not found]         ` <92efce5c-f9ad-847b-09b3-e7c6f7f13ff6-5C7GfCeVMHo@public.gmane.org>
@ 2018-10-19 15:06           ` Grodzovsky, Andrey
       [not found]             ` <0928be66-d606-84b8-4fa8-399e5615f75c-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Grodzovsky, Andrey @ 2018-10-19 15:06 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander



On 10/19/2018 03:08 AM, Koenig, Christian wrote:
> Am 18.10.18 um 20:44 schrieb Andrey Grodzovsky:
>> A ring might become unusable after reset, if that is the case
>> drm_sched_entity_select_rq will choose another, working rq
>> to run the job if there is one.
>> Also, skip recovery of ring which is not ready.
> Well that is not even remotely sufficient.
>
> If we can't bring up a ring any more after a reset we would need to move
> all jobs which where previously scheduled on it and all of its entities
> to a different instance.

That one should be easy to add inside amdgpu_device_gpu_recover in case
ring is dead, we just do the same for all the jobs in 
sched->ring_mirror_list of the dead ring
before doing recovery for them, no?
>
> What you do here breaks dependencies between jobs and can result in
> unforeseen consequences including random memory writes.

Can you explain this a bit more ? AFAIK any job dependent on this job 
will still wait for its completion
before running regardless of did this job moved to a different ring. 
What am I missing ?
>
> As far as I can see that can't be done correctly with the current
> scheduler design.
>
> Additional to that when we can't restart one instance of a ring we
> usually can't restart all others either. So the whole approach is rather
> pointless.

 From my testing looks like we can, compute ring 0 is dead but IB tests 
pass on other compute rings.

Andrey

>
> Regards,
> Christian.
>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index d11489e..3124ca1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3355,10 +3355,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    	else
>>    		r = amdgpu_device_reset(adev);
>>    
>> +	/*
>> +	 * After reboot a ring might fail in which case this will
>> +	 * move the job to different rq if possible
>> +	 */
>> +	if (job) {
>> +		drm_sched_entity_select_rq(job->base.entity);
>> +		if (job->base.entity->rq) {
>> +			job->base.sched = job->base.entity->rq->sched;
>> +		} else {
>> +			job->base.sched = NULL;
>> +			r = -ENOENT;
>> +		}
>> +	}
>> +
>>    	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>    		struct amdgpu_ring *ring = adev->rings[i];
>>    
>> -		if (!ring || !ring->sched.thread)
>> +		if (!ring || !ring->ready || !ring->sched.thread)
>>    			continue;
>>    
>>    		/* only need recovery sched of the given job's ring
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: Refresh rq selection for job after ASIC reset
       [not found]             ` <0928be66-d606-84b8-4fa8-399e5615f75c-5C7GfCeVMHo@public.gmane.org>
@ 2018-10-19 16:28               ` Christian König
  2018-10-19 16:50                 ` Grodzovsky, Andrey
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2018-10-19 16:28 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander

Am 19.10.18 um 17:06 schrieb Grodzovsky, Andrey:
>
> On 10/19/2018 03:08 AM, Koenig, Christian wrote:
>> Am 18.10.18 um 20:44 schrieb Andrey Grodzovsky:
>>> A ring might become unusable after reset, if that is the case
>>> drm_sched_entity_select_rq will choose another, working rq
>>> to run the job if there is one.
>>> Also, skip recovery of ring which is not ready.
>> Well that is not even remotely sufficient.
>>
>> If we can't bring up a ring any more after a reset we would need to move
>> all jobs which were previously scheduled on it and all of its entities
>> to a different instance.
> That one should be easy to add inside amdgpu_device_gpu_recover in case
> ring is dead, we just do the same for all the jobs in
> sched->ring_mirror_list of the dead ring
> before doing recovery for them, no?

Correct, you need to execute all jobs from the ring_mirror_list as well 
as move all entities to other rqs.

But there are some problem with that see below.

>> What you do here breaks dependencies between jobs and can result in
>> unforeseen consequences including random memory writes.
> Can you explain this a bit more ? AFAIK any job dependent on this job
> will still wait for its completion
> before running regardless of did this job moved to a different ring.
> What am I missing ?

The stuff dependent on the moving jobs should indeed work correctly.

But you don't necessarily have space to execute the ring_mirror_list on 
another scheduler. Add to that, the jobs are prepared to run on a 
specific ring, e.g. UVD jobs are patched, VMIDs assigned etc etc...

So that most likely won't work correctly.

>> As far as I can see that can't be done correctly with the current
>> scheduler design.
>>
>> Additional to that when we can't restart one instance of a ring we
>> usually can't restart all others either. So the whole approach is rather
>> pointless.
>   From my testing looks like we can, compute ring 0 is dead but IB tests
> pass on other compute rings.

Interesting, but I would rather investigate why compute ring 0 is dead 
while other still work.

At least some compute rings should be handled by the same engine, so if 
one is dead all other should be dead as well.

Christian.

>
> Andrey
>
>> Regards,
>> Christian.
>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>>     1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index d11489e..3124ca1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3355,10 +3355,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     	else
>>>     		r = amdgpu_device_reset(adev);
>>>     
>>> +	/*
>>> +	 * After reboot a ring might fail in which case this will
>>> +	 * move the job to different rq if possible
>>> +	 */
>>> +	if (job) {
>>> +		drm_sched_entity_select_rq(job->base.entity);
>>> +		if (job->base.entity->rq) {
>>> +			job->base.sched = job->base.entity->rq->sched;
>>> +		} else {
>>> +			job->base.sched = NULL;
>>> +			r = -ENOENT;
>>> +		}
>>> +	}
>>> +
>>>     	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>     		struct amdgpu_ring *ring = adev->rings[i];
>>>     
>>> -		if (!ring || !ring->sched.thread)
>>> +		if (!ring || !ring->ready || !ring->sched.thread)
>>>     			continue;
>>>     
>>>     		/* only need recovery sched of the given job's ring
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: Refresh rq selection for job after ASIC reset
  2018-10-19 16:28               ` Christian König
@ 2018-10-19 16:50                 ` Grodzovsky, Andrey
  0 siblings, 0 replies; 10+ messages in thread
From: Grodzovsky, Andrey @ 2018-10-19 16:50 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx, dri-devel; +Cc: Deucher, Alexander


[-- Attachment #1.1: Type: text/plain, Size: 285 bytes --]

That's my next step.

Andrey

On 10/19/2018 12:28 PM, Christian König wrote:
  From my testing looks like we can, compute ring 0 is dead but IB tests
pass on other compute rings.

Interesting, but I would rather investigate why compute ring 0 is dead while other still work.


[-- Attachment #1.2: Type: text/html, Size: 687 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2018-10-19 16:50 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-18 18:44 [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work Andrey Grodzovsky
2018-10-18 18:44 ` [PATCH 2/3] drm/sched: Expose drm_sched_entity_select_rq to use in drivers Andrey Grodzovsky
2018-10-19  7:03 ` [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work Koenig, Christian
     [not found] ` <1539888261-331-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
2018-10-18 18:44   ` [PATCH 3/3] drm/amdgpu: Refresh rq selection for job after ASIC reset Andrey Grodzovsky
     [not found]     ` <1539888261-331-3-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
2018-10-19  7:08       ` Koenig, Christian
     [not found]         ` <92efce5c-f9ad-847b-09b3-e7c6f7f13ff6-5C7GfCeVMHo@public.gmane.org>
2018-10-19 15:06           ` Grodzovsky, Andrey
     [not found]             ` <0928be66-d606-84b8-4fa8-399e5615f75c-5C7GfCeVMHo@public.gmane.org>
2018-10-19 16:28               ` Christian König
2018-10-19 16:50                 ` Grodzovsky, Andrey
2018-10-19  7:13   ` [PATCH 1/3] drm/sched: Add callback to mark if sched is ready to work Michel Dänzer
     [not found]     ` <567b553a-eef7-9560-a789-f1c8bffe3206-otUistvHUpPR7s880joybQ@public.gmane.org>
2018-10-19 14:45       ` Grodzovsky, Andrey

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.