All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-24 10:58 ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-24 10:58 UTC (permalink / raw)
  To: Deucher, Alexander, Koenig, Christian, Grodzovsky, Andrey
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, S, Shirish

[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
 	bool                            in_gpu_reset;
 	enum pp_mp1_state               mp1_state;
 	struct mutex  lock_reset;
+	struct mutex  lock_ib_sched;
 	struct amdgpu_doorbell_index doorbell_index;
 
 	int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
 	mutex_init(&adev->lock_reset);
+	mutex_init(&adev->lock_ib_sched);
 	mutex_init(&adev->virt.dpm_mutex);
 	mutex_init(&adev->psp.mutex);
 
@@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				if (r)
 					return r;
 
+				mutex_lock(&tmp_adev->lock_ib_sched);
 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
+				mutex_unlock(&tmp_adev->lock_ib_sched);
 				if (r)
 					goto out;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 	if (finished->error < 0) {
 		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
+		mutex_lock(&ring->adev->lock_ib_sched);
 		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
 				       &fence);
+		mutex_unlock(&ring->adev->lock_ib_sched);
 		if (r)
 			DRM_ERROR("Error scheduling IBs (%d)\n", r);
 	}
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-24 10:58 ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-24 10:58 UTC (permalink / raw)
  To: Deucher, Alexander, Koenig, Christian, Grodzovsky, Andrey
  Cc: amd-gfx, S, Shirish

[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
 	bool                            in_gpu_reset;
 	enum pp_mp1_state               mp1_state;
 	struct mutex  lock_reset;
+	struct mutex  lock_ib_sched;
 	struct amdgpu_doorbell_index doorbell_index;
 
 	int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
 	mutex_init(&adev->lock_reset);
+	mutex_init(&adev->lock_ib_sched);
 	mutex_init(&adev->virt.dpm_mutex);
 	mutex_init(&adev->psp.mutex);
 
@@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				if (r)
 					return r;
 
+				mutex_lock(&tmp_adev->lock_ib_sched);
 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
+				mutex_unlock(&tmp_adev->lock_ib_sched);
 				if (r)
 					goto out;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 	if (finished->error < 0) {
 		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
+		mutex_lock(&ring->adev->lock_ib_sched);
 		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
 				       &fence);
+		mutex_unlock(&ring->adev->lock_ib_sched);
 		if (r)
 			DRM_ERROR("Error scheduling IBs (%d)\n", r);
 	}
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-24 11:01     ` Christian König
  0 siblings, 0 replies; 38+ messages in thread
From: Christian König @ 2019-10-24 11:01 UTC (permalink / raw)
  To: S, Shirish, Deucher, Alexander, Koenig, Christian, Grodzovsky, Andrey
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 24.10.19 um 12:58 schrieb S, Shirish:
> [Why]
> Upon GPU reset, kernel cleans up already submitted jobs
> via drm_sched_cleanup_jobs.
> This schedules ib's via drm_sched_main()->run_job, leading to
> race condition of rings being ready or not, since during reset
> rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Please sync up with Andrey how this was able to happen.

Regards,
Christian.

>
> [How]
> make GPU reset's amdgpu_device_ip_resume_phase2() &
> amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.
>
> Signed-off-by: Shirish S <shirish.s@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
>   3 files changed, 6 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f4d9041..7b07a47b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -973,6 +973,7 @@ struct amdgpu_device {
>   	bool                            in_gpu_reset;
>   	enum pp_mp1_state               mp1_state;
>   	struct mutex  lock_reset;
> +	struct mutex  lock_ib_sched;
>   	struct amdgpu_doorbell_index doorbell_index;
>   
>   	int asic_reset_res;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 676cad1..63cad74 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->virt.vf_errors.lock);
>   	hash_init(adev->mn_hash);
>   	mutex_init(&adev->lock_reset);
> +	mutex_init(&adev->lock_ib_sched);
>   	mutex_init(&adev->virt.dpm_mutex);
>   	mutex_init(&adev->psp.mutex);
>   
> @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   				if (r)
>   					return r;
>   
> +				mutex_lock(&tmp_adev->lock_ib_sched);
>   				r = amdgpu_device_ip_resume_phase2(tmp_adev);
> +				mutex_unlock(&tmp_adev->lock_ib_sched);
>   				if (r)
>   					goto out;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e1bad99..cd6082d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   	if (finished->error < 0) {
>   		DRM_INFO("Skip scheduling IBs!\n");
>   	} else {
> +		mutex_lock(&ring->adev->lock_ib_sched);
>   		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
>   				       &fence);
> +		mutex_unlock(&ring->adev->lock_ib_sched);
>   		if (r)
>   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>   	}

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-24 11:01     ` Christian König
  0 siblings, 0 replies; 38+ messages in thread
From: Christian König @ 2019-10-24 11:01 UTC (permalink / raw)
  To: S, Shirish, Deucher, Alexander, Koenig, Christian, Grodzovsky, Andrey
  Cc: amd-gfx

Am 24.10.19 um 12:58 schrieb S, Shirish:
> [Why]
> Upon GPU reset, kernel cleans up already submitted jobs
> via drm_sched_cleanup_jobs.
> This schedules ib's via drm_sched_main()->run_job, leading to
> race condition of rings being ready or not, since during reset
> rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Please sync up with Andrey how this was able to happen.

Regards,
Christian.

>
> [How]
> make GPU reset's amdgpu_device_ip_resume_phase2() &
> amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.
>
> Signed-off-by: Shirish S <shirish.s@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
>   3 files changed, 6 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f4d9041..7b07a47b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -973,6 +973,7 @@ struct amdgpu_device {
>   	bool                            in_gpu_reset;
>   	enum pp_mp1_state               mp1_state;
>   	struct mutex  lock_reset;
> +	struct mutex  lock_ib_sched;
>   	struct amdgpu_doorbell_index doorbell_index;
>   
>   	int asic_reset_res;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 676cad1..63cad74 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->virt.vf_errors.lock);
>   	hash_init(adev->mn_hash);
>   	mutex_init(&adev->lock_reset);
> +	mutex_init(&adev->lock_ib_sched);
>   	mutex_init(&adev->virt.dpm_mutex);
>   	mutex_init(&adev->psp.mutex);
>   
> @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   				if (r)
>   					return r;
>   
> +				mutex_lock(&tmp_adev->lock_ib_sched);
>   				r = amdgpu_device_ip_resume_phase2(tmp_adev);
> +				mutex_unlock(&tmp_adev->lock_ib_sched);
>   				if (r)
>   					goto out;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e1bad99..cd6082d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   	if (finished->error < 0) {
>   		DRM_INFO("Skip scheduling IBs!\n");
>   	} else {
> +		mutex_lock(&ring->adev->lock_ib_sched);
>   		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
>   				       &fence);
> +		mutex_unlock(&ring->adev->lock_ib_sched);
>   		if (r)
>   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>   	}

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-24 15:06         ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-24 15:06 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 3624 bytes --]


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }


[-- Attachment #1.2: Type: text/html, Size: 8224 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-24 15:06         ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-24 15:06 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander; +Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 3624 bytes --]


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }


[-- Attachment #1.2: Type: text/html, Size: 8224 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-24 16:30             ` Christian König
  0 siblings, 0 replies; 38+ messages in thread
From: Christian König @ 2019-10-24 16:30 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 4504 bytes --]

Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:
>
>
> On 10/24/19 7:01 AM, Christian König wrote:
>> Am 24.10.19 um 12:58 schrieb S, Shirish:
>>> [Why]
>>> Upon GPU reset, kernel cleans up already submitted jobs
>>> via drm_sched_cleanup_jobs.
>>> This schedules ib's via drm_sched_main()->run_job, leading to
>>> race condition of rings being ready or not, since during reset
>>> rings may be suspended.
>>
>> NAK, exactly that's what should not happen.
>>
>> The scheduler should be suspend while a GPU reset is in progress.
>>
>> So you are running into a completely different race here.
>>
>> Please sync up with Andrey how this was able to happen.
>>
>> Regards,
>> Christian.
>
>
> Shirish - Christian makes a good point - note that in 
> amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler 
> threads is called way before we suspend the HW in 
> amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA 
> suspension is happening and where the HW ring marked as not ready - 
> please provide call stack for when you hit [drm:amdgpu_job_run] 
> *ERROR* Error scheduling IBs (-22) to identify the code path which 
> tried to submit the SDMA IB
>

Well the most likely cause of this is that the hardware failed to resume 
after the reset.

Christian.

> Andrey
>
>
>>
>>>
>>> [How]
>>> make GPU reset's amdgpu_device_ip_resume_phase2() &
>>> amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.
>>>
>>> Signed-off-by: Shirish S <shirish.s-5C7GfCeVMHo@public.gmane.org>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
>>>   3 files changed, 6 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index f4d9041..7b07a47b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -973,6 +973,7 @@ struct amdgpu_device {
>>>       bool                            in_gpu_reset;
>>>       enum pp_mp1_state               mp1_state;
>>>       struct mutex  lock_reset;
>>> +    struct mutex  lock_ib_sched;
>>>       struct amdgpu_doorbell_index doorbell_index;
>>>         int asic_reset_res;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 676cad1..63cad74 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>> *adev,
>>>       mutex_init(&adev->virt.vf_errors.lock);
>>>       hash_init(adev->mn_hash);
>>>       mutex_init(&adev->lock_reset);
>>> +    mutex_init(&adev->lock_ib_sched);
>>>       mutex_init(&adev->virt.dpm_mutex);
>>>       mutex_init(&adev->psp.mutex);
>>>   @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct 
>>> amdgpu_hive_info *hive,
>>>                   if (r)
>>>                       return r;
>>>   + mutex_lock(&tmp_adev->lock_ib_sched);
>>>                   r = amdgpu_device_ip_resume_phase2(tmp_adev);
>>> + mutex_unlock(&tmp_adev->lock_ib_sched);
>>>                   if (r)
>>>                       goto out;
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index e1bad99..cd6082d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct 
>>> drm_sched_job *sched_job)
>>>       if (finished->error < 0) {
>>>           DRM_INFO("Skip scheduling IBs!\n");
>>>       } else {
>>> +        mutex_lock(&ring->adev->lock_ib_sched);
>>>           r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
>>>                          &fence);
>>> +        mutex_unlock(&ring->adev->lock_ib_sched);
>>>           if (r)
>>>               DRM_ERROR("Error scheduling IBs (%d)\n", r);
>>>       }
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[-- Attachment #1.2: Type: text/html, Size: 9717 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-24 16:30             ` Christian König
  0 siblings, 0 replies; 38+ messages in thread
From: Christian König @ 2019-10-24 16:30 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 4455 bytes --]

Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:
>
>
> On 10/24/19 7:01 AM, Christian König wrote:
>> Am 24.10.19 um 12:58 schrieb S, Shirish:
>>> [Why]
>>> Upon GPU reset, kernel cleans up already submitted jobs
>>> via drm_sched_cleanup_jobs.
>>> This schedules ib's via drm_sched_main()->run_job, leading to
>>> race condition of rings being ready or not, since during reset
>>> rings may be suspended.
>>
>> NAK, exactly that's what should not happen.
>>
>> The scheduler should be suspend while a GPU reset is in progress.
>>
>> So you are running into a completely different race here.
>>
>> Please sync up with Andrey how this was able to happen.
>>
>> Regards,
>> Christian.
>
>
> Shirish - Christian makes a good point - note that in 
> amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler 
> threads is called way before we suspend the HW in 
> amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA 
> suspension is happening and where the HW ring marked as not ready - 
> please provide call stack for when you hit [drm:amdgpu_job_run] 
> *ERROR* Error scheduling IBs (-22) to identify the code path which 
> tried to submit the SDMA IB
>

Well the most likely cause of this is that the hardware failed to resume 
after the reset.

Christian.

> Andrey
>
>
>>
>>>
>>> [How]
>>> make GPU reset's amdgpu_device_ip_resume_phase2() &
>>> amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.
>>>
>>> Signed-off-by: Shirish S <shirish.s@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
>>>   3 files changed, 6 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index f4d9041..7b07a47b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -973,6 +973,7 @@ struct amdgpu_device {
>>>       bool                            in_gpu_reset;
>>>       enum pp_mp1_state               mp1_state;
>>>       struct mutex  lock_reset;
>>> +    struct mutex  lock_ib_sched;
>>>       struct amdgpu_doorbell_index doorbell_index;
>>>         int asic_reset_res;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 676cad1..63cad74 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>> *adev,
>>>       mutex_init(&adev->virt.vf_errors.lock);
>>>       hash_init(adev->mn_hash);
>>>       mutex_init(&adev->lock_reset);
>>> +    mutex_init(&adev->lock_ib_sched);
>>>       mutex_init(&adev->virt.dpm_mutex);
>>>       mutex_init(&adev->psp.mutex);
>>>   @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct 
>>> amdgpu_hive_info *hive,
>>>                   if (r)
>>>                       return r;
>>>   + mutex_lock(&tmp_adev->lock_ib_sched);
>>>                   r = amdgpu_device_ip_resume_phase2(tmp_adev);
>>> + mutex_unlock(&tmp_adev->lock_ib_sched);
>>>                   if (r)
>>>                       goto out;
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index e1bad99..cd6082d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct 
>>> drm_sched_job *sched_job)
>>>       if (finished->error < 0) {
>>>           DRM_INFO("Skip scheduling IBs!\n");
>>>       } else {
>>> +        mutex_lock(&ring->adev->lock_ib_sched);
>>>           r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
>>>                          &fence);
>>> +        mutex_unlock(&ring->adev->lock_ib_sched);
>>>           if (r)
>>>               DRM_ERROR("Error scheduling IBs (%d)\n", r);
>>>       }
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[-- Attachment #1.2: Type: text/html, Size: 9517 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25  8:50                 ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-25  8:50 UTC (permalink / raw)
  To: Koenig, Christian, Grodzovsky, Andrey, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 6129 bytes --]

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 13440 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25  8:50                 ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-25  8:50 UTC (permalink / raw)
  To: Koenig, Christian, Grodzovsky, Andrey, S, Shirish, Deucher, Alexander
  Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 6129 bytes --]

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 13440 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25  8:53                     ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-25  8:53 UTC (permalink / raw)
  To: S, Shirish, Grodzovsky, Andrey, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 6685 bytes --]

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 14297 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25  8:53                     ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-25  8:53 UTC (permalink / raw)
  To: S, Shirish, Grodzovsky, Andrey, Deucher, Alexander; +Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 6685 bytes --]

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 14297 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25  9:22                         ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-25  9:22 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Grodzovsky, Andrey, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 7536 bytes --]


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx






[-- Attachment #1.2: Type: text/html, Size: 15769 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25  9:22                         ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-25  9:22 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Grodzovsky, Andrey, Deucher, Alexander
  Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 7536 bytes --]


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx






[-- Attachment #1.2: Type: text/html, Size: 15769 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25  9:26                             ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-25  9:26 UTC (permalink / raw)
  To: S, Shirish, Grodzovsky, Andrey, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 7966 bytes --]

Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?

Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[-- Attachment #1.2: Type: text/html, Size: 16564 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25  9:26                             ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-25  9:26 UTC (permalink / raw)
  To: S, Shirish, Grodzovsky, Andrey, Deucher, Alexander; +Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 7966 bytes --]

Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?

Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[-- Attachment #1.2: Type: text/html, Size: 16564 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 10:08                                 ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-25 10:08 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Grodzovsky, Andrey, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 8725 bytes --]


On 10/25/2019 2:56 PM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.


The drm_sched_stop() as i mentioned only parks the thread and cancels work and nothing else, not sure why you think it hasn't stopped or done what it is supposed to do.

Since it works 3/5 times.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?

Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Its sometimes triggered from drm_sched_entity_fini(), as i can see in prints but not always.

I believe application crashing while GPU resets is anticipated, depending upon workloads and state of gfx renderer when reset has happened.

Since reset is something that is not a usual/routine/regular event, such anomalies are to be expected when it happens,

so we need to have failsafe methods like this patch and may be some more based on system behavior upon reset.

Regards,

Shirish S

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx







[-- Attachment #1.2: Type: text/html, Size: 17862 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 10:08                                 ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-25 10:08 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Grodzovsky, Andrey, Deucher, Alexander
  Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 8725 bytes --]


On 10/25/2019 2:56 PM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.


The drm_sched_stop() as i mentioned only parks the thread and cancels work and nothing else, not sure why you think it hasn't stopped or done what it is supposed to do.

Since it works 3/5 times.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?

Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Its sometimes triggered from drm_sched_entity_fini(), as i can see in prints but not always.

I believe application crashing while GPU resets is anticipated, depending upon workloads and state of gfx renderer when reset has happened.

Since reset is something that is not a usual/routine/regular event, such anomalies are to be expected when it happens,

so we need to have failsafe methods like this patch and may be some more based on system behavior upon reset.

Regards,

Shirish S

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx







[-- Attachment #1.2: Type: text/html, Size: 17862 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 10:32                                     ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-25 10:32 UTC (permalink / raw)
  To: S, Shirish, Grodzovsky, Andrey, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 9292 bytes --]

Am 25.10.19 um 12:08 schrieb S, Shirish:


On 10/25/2019 2:56 PM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.


The drm_sched_stop() as i mentioned only parks the thread

That should be sufficient for the thread to be halted and not submit more jobs to the hardware.


and cancels work and nothing else, not sure why you think it hasn't stopped or done what it is supposed to do.

Since it works 3/5 times.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?

Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Its sometimes triggered from drm_sched_entity_fini(), as i can see in prints but not always.

I believe application crashing while GPU resets is anticipated, depending upon workloads and state of gfx renderer when reset has happened.

Since reset is something that is not a usual/routine/regular event, such anomalies are to be expected when it happens,

so we need to have failsafe methods like this patch and may be some more based on system behavior upon reset.

Sorry but your patch is complete nonsense from the design point of view.

It is absolutely mandatory that the scheduler is stopped and the thread correctly parked for the GPU reset to work properly.

See you not only need to prevent running new jobs, but also job preparation (e.g. grabbing VMIDs) or otherwise you will immediately run into the next GPU reset after the first one finished.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 18868 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 10:32                                     ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-25 10:32 UTC (permalink / raw)
  To: S, Shirish, Grodzovsky, Andrey, Deucher, Alexander; +Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 9292 bytes --]

Am 25.10.19 um 12:08 schrieb S, Shirish:


On 10/25/2019 2:56 PM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.


The drm_sched_stop() as i mentioned only parks the thread

That should be sufficient for the thread to be halted and not submit more jobs to the hardware.


and cancels work and nothing else, not sure why you think it hasn't stopped or done what it is supposed to do.

Since it works 3/5 times.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?

Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Its sometimes triggered from drm_sched_entity_fini(), as i can see in prints but not always.

I believe application crashing while GPU resets is anticipated, depending upon workloads and state of gfx renderer when reset has happened.

Since reset is something that is not a usual/routine/regular event, such anomalies are to be expected when it happens,

so we need to have failsafe methods like this patch and may be some more based on system behavior upon reset.

Sorry but your patch is complete nonsense from the design point of view.

It is absolutely mandatory that the scheduler is stopped and the thread correctly parked for the GPU reset to work properly.

See you not only need to prevent running new jobs, but also job preparation (e.g. grabbing VMIDs) or otherwise you will immediately run into the next GPU reset after the first one finished.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 18868 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 15:35                                 ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-25 15:35 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 8561 bytes --]


On 10/25/19 5:26 AM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?


We still have it and isn't doing kthread_park()/unpark() from drm_sched_entity_fini while GPU reset in progress defeats all the purpose of drm_sched_stop->kthread_park ? If drm_sched_entity_fini-> kthread_unpark happens AFTER drm_sched_stop->kthread_park nothing prevents from another (third) thread keep submitting job to HW which will be picked up by the unparked scheduler thread try to submit to HW but fail because the HW ring is deactivated.

If so maybe we should serialize calls to kthread_park/unpark(sched->thread) ?

Andrey


Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[-- Attachment #1.2: Type: text/html, Size: 17573 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 15:35                                 ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-25 15:35 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander; +Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 8561 bytes --]


On 10/25/19 5:26 AM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?


We still have it and isn't doing kthread_park()/unpark() from drm_sched_entity_fini while GPU reset in progress defeats all the purpose of drm_sched_stop->kthread_park ? If drm_sched_entity_fini-> kthread_unpark happens AFTER drm_sched_stop->kthread_park nothing prevents from another (third) thread keep submitting job to HW which will be picked up by the unparked scheduler thread try to submit to HW but fail because the HW ring is deactivated.

If so maybe we should serialize calls to kthread_park/unpark(sched->thread) ?

Andrey


Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[-- Attachment #1.2: Type: text/html, Size: 17573 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 15:57                                     ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-25 15:57 UTC (permalink / raw)
  To: Grodzovsky, Andrey, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 8874 bytes --]

Am 25.10.19 um 17:35 schrieb Grodzovsky, Andrey:


On 10/25/19 5:26 AM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?


We still have it and isn't doing kthread_park()/unpark() from drm_sched_entity_fini while GPU reset in progress defeats all the purpose of drm_sched_stop->kthread_park ? If drm_sched_entity_fini-> kthread_unpark happens AFTER drm_sched_stop->kthread_park nothing prevents from another (third) thread keep submitting job to HW which will be picked up by the unparked scheduler thread try to submit to HW but fail because the HW ring is deactivated.

If so maybe we should serialize calls to kthread_park/unpark(sched->thread) ?

Yeah, that was my thinking as well. Probably best to just grab the reset lock before calling drm_sched_entity_fini().

Alternative I think we could change the kthread_park/unpark to a wait_event_.... in drm_sched_entity_fini().

Regards,
Christian.


Andrey


Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 18244 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 15:57                                     ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-25 15:57 UTC (permalink / raw)
  To: Grodzovsky, Andrey, S, Shirish, Deucher, Alexander; +Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 8874 bytes --]

Am 25.10.19 um 17:35 schrieb Grodzovsky, Andrey:


On 10/25/19 5:26 AM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?


We still have it and isn't doing kthread_park()/unpark() from drm_sched_entity_fini while GPU reset in progress defeats all the purpose of drm_sched_stop->kthread_park ? If drm_sched_entity_fini-> kthread_unpark happens AFTER drm_sched_stop->kthread_park nothing prevents from another (third) thread keep submitting job to HW which will be picked up by the unparked scheduler thread try to submit to HW but fail because the HW ring is deactivated.

If so maybe we should serialize calls to kthread_park/unpark(sched->thread) ?

Yeah, that was my thinking as well. Probably best to just grab the reset lock before calling drm_sched_entity_fini().

Alternative I think we could change the kthread_park/unpark to a wait_event_.... in drm_sched_entity_fini().

Regards,
Christian.


Andrey


Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 18244 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 16:02                                         ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-25 16:02 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 9090 bytes --]


On 10/25/19 11:57 AM, Koenig, Christian wrote:
Am 25.10.19 um 17:35 schrieb Grodzovsky, Andrey:


On 10/25/19 5:26 AM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?


We still have it and isn't doing kthread_park()/unpark() from drm_sched_entity_fini while GPU reset in progress defeats all the purpose of drm_sched_stop->kthread_park ? If drm_sched_entity_fini-> kthread_unpark happens AFTER drm_sched_stop->kthread_park nothing prevents from another (third) thread keep submitting job to HW which will be picked up by the unparked scheduler thread try to submit to HW but fail because the HW ring is deactivated.

If so maybe we should serialize calls to kthread_park/unpark(sched->thread) ?

Yeah, that was my thinking as well. Probably best to just grab the reset lock before calling drm_sched_entity_fini().


Shirish - please try locking &adev->lock_reset around calls to drm_sched_entity_fini as Christian suggests and see if this actually helps the issue.

Andrey


Alternative I think we could change the kthread_park/unpark to a wait_event_.... in drm_sched_entity_fini().

Regards,
Christian.


Andrey


Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 18920 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-25 16:02                                         ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-25 16:02 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander; +Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 9090 bytes --]


On 10/25/19 11:57 AM, Koenig, Christian wrote:
Am 25.10.19 um 17:35 schrieb Grodzovsky, Andrey:


On 10/25/19 5:26 AM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?


We still have it and isn't doing kthread_park()/unpark() from drm_sched_entity_fini while GPU reset in progress defeats all the purpose of drm_sched_stop->kthread_park ? If drm_sched_entity_fini-> kthread_unpark happens AFTER drm_sched_stop->kthread_park nothing prevents from another (third) thread keep submitting job to HW which will be picked up by the unparked scheduler thread try to submit to HW but fail because the HW ring is deactivated.

If so maybe we should serialize calls to kthread_park/unpark(sched->thread) ?

Yeah, that was my thinking as well. Probably best to just grab the reset lock before calling drm_sched_entity_fini().


Shirish - please try locking &adev->lock_reset around calls to drm_sched_entity_fini as Christian suggests and see if this actually helps the issue.

Andrey


Alternative I think we could change the kthread_park/unpark to a wait_event_.... in drm_sched_entity_fini().

Regards,
Christian.


Andrey


Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx





[-- Attachment #1.2: Type: text/html, Size: 18920 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30  8:44                                             ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-30  8:44 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 9199 bytes --]


On 10/25/2019 9:32 PM, Grodzovsky, Andrey wrote:


On 10/25/19 11:57 AM, Koenig, Christian wrote:
Am 25.10.19 um 17:35 schrieb Grodzovsky, Andrey:


On 10/25/19 5:26 AM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?


We still have it and isn't doing kthread_park()/unpark() from drm_sched_entity_fini while GPU reset in progress defeats all the purpose of drm_sched_stop->kthread_park ? If drm_sched_entity_fini-> kthread_unpark happens AFTER drm_sched_stop->kthread_park nothing prevents from another (third) thread keep submitting job to HW which will be picked up by the unparked scheduler thread try to submit to HW but fail because the HW ring is deactivated.

If so maybe we should serialize calls to kthread_park/unpark(sched->thread) ?

Yeah, that was my thinking as well. Probably best to just grab the reset lock before calling drm_sched_entity_fini().


Shirish - please try locking &adev->lock_reset around calls to drm_sched_entity_fini as Christian suggests and see if this actually helps the issue.

Yes that also works.

Regards,

Shirish S

Andrey


Alternative I think we could change the kthread_park/unpark to a wait_event_.... in drm_sched_entity_fini().

Regards,
Christian.


Andrey


Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx








[-- Attachment #1.2: Type: text/html, Size: 19641 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30  8:44                                             ` S, Shirish
  0 siblings, 0 replies; 38+ messages in thread
From: S, Shirish @ 2019-10-30  8:44 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 9199 bytes --]


On 10/25/2019 9:32 PM, Grodzovsky, Andrey wrote:


On 10/25/19 11:57 AM, Koenig, Christian wrote:
Am 25.10.19 um 17:35 schrieb Grodzovsky, Andrey:


On 10/25/19 5:26 AM, Koenig, Christian wrote:
Am 25.10.19 um 11:22 schrieb S, Shirish:


On 10/25/2019 2:23 PM, Koenig, Christian wrote:

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

This is what's the root of the problem.

The scheduler should never be resumed before we are done with bringing back the hardware into an usable state.

I dont see the scheduler being resumed when the ib is scheduled, its done way after the hardware is ready in reset code path.

Below are the logs:

amdgpu 0000:03:00.0: GPU reset begin!
amdgpu_device_gpu_recover calling drm_sched_stop             <==
...
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...
amdgpu_device_ip_resume_phase2 resumed sdma_v4_0
amdgpu_device_ip_resume_phase2 resumed powerplay
amdgpu_device_ip_resume_phase2 resumed dm
...
[drm] recover vram bo from shadow done
amdgpu_device_gpu_recover calling  drm_sched_start         <==
...

As mentioned in the call trace, drm_sched_main() is responsible for this job_run which seems to be called during cleanup.

Then the scheduler isn't stopped for some reason and we need to investigate why.

We used to have another kthread_park()/unpark() in drm_sched_entity_fini(), maybe an application is crashing while we are trying to reset the GPU?


We still have it and isn't doing kthread_park()/unpark() from drm_sched_entity_fini while GPU reset in progress defeats all the purpose of drm_sched_stop->kthread_park ? If drm_sched_entity_fini-> kthread_unpark happens AFTER drm_sched_stop->kthread_park nothing prevents from another (third) thread keep submitting job to HW which will be picked up by the unparked scheduler thread try to submit to HW but fail because the HW ring is deactivated.

If so maybe we should serialize calls to kthread_park/unpark(sched->thread) ?

Yeah, that was my thinking as well. Probably best to just grab the reset lock before calling drm_sched_entity_fini().


Shirish - please try locking &adev->lock_reset around calls to drm_sched_entity_fini as Christian suggests and see if this actually helps the issue.

Yes that also works.

Regards,

Shirish S

Andrey


Alternative I think we could change the kthread_park/unpark to a wait_event_.... in drm_sched_entity_fini().

Regards,
Christian.


Andrey


Would be rather unlikely, especially that would be rather hard to reproduce but currently my best bet what's going wrong here.

Regards,
Christian.


Regards,

Shirish S

Regards,
Christian.

Am 25.10.19 um 10:50 schrieb S, Shirish:

Here is the call trace:

Call Trace:
 dump_stack+0x4d/0x63
 amdgpu_ib_schedule+0x86/0x4b7
 ? __mod_timer+0x21e/0x244
 amdgpu_job_run+0x108/0x178
 drm_sched_main+0x253/0x2fa
 ? remove_wait_queue+0x51/0x51
 ? drm_sched_cleanup_jobs.part.12+0xda/0xda
 kthread+0x14f/0x157
 ? kthread_park+0x86/0x86
 ret_from_fork+0x22/0x40
amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)

printed via below change:

@@ -151,6 +152,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }

        if (!ring->sched.ready) {
+              dump_stack();
                dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name);
                return -EINVAL;

On 10/24/2019 10:00 PM, Christian König wrote:
Am 24.10.19 um 17:06 schrieb Grodzovsky, Andrey:


On 10/24/19 7:01 AM, Christian König wrote:
Am 24.10.19 um 12:58 schrieb S, Shirish:
[Why]
Upon GPU reset, kernel cleans up already submitted jobs
via drm_sched_cleanup_jobs.
This schedules ib's via drm_sched_main()->run_job, leading to
race condition of rings being ready or not, since during reset
rings may be suspended.

NAK, exactly that's what should not happen.

The scheduler should be suspend while a GPU reset is in progress.

So you are running into a completely different race here.

Below is the series of events when the issue occurs.

(Note that as you & Andrey mentioned the scheduler has been suspended but the job is scheduled nonetheless.)

amdgpu 0000:03:00.0: GPU reset begin!

...

amdgpu_device_gpu_recover stopping ring sdma0 via drm_sched_stop

...

amdgpu 0000:03:00.0: GPU reset succeeded, trying to resume

amdgpu_do_asic_reset starting to resume blocks

...

amdgpu 0000:03:00.0: couldn't schedule ib on ring <sdma0>
[drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22)
...

amdgpu_device_ip_resume_phase2 resumed gfx_v9_0

amdgpu_device_ip_resume_phase2 resumed sdma_v4_0

amdgpu_device_ip_resume_phase2 resumed powerplay

...

FWIW, since the job is always NULL when "drm_sched_stop(&ring->sched, job ? &job->base : NULL);" when called during reset, all drm_sched_stop() does

is  cancel delayed work and park the sched->thread. There is no job list to be iterated to de-activate or remove or update fences.

Based on all this analysis, adding a mutex is more failsafe and less intrusive in the current code flow and lastly seems to be logical as well, hence I devised this approach


Please sync up with Andrey how this was able to happen.

Regards,
Christian.


Shirish - Christian makes a good point - note that in amdgpu_device_gpu_recover drm_sched_stop which stop all the scheduler threads is called way before we suspend the HW in amdgpu_device_pre_asic_reset->amdgpu_device_ip_suspend where SDMA suspension is happening and where the HW ring marked as not ready - please provide call stack for when you hit [drm:amdgpu_job_run] *ERROR* Error scheduling IBs (-22) to identify the code path which tried to submit the SDMA IB

Well the most likely cause of this is that the hardware failed to resume after the reset.

Infact hardware resume has not yet started, when the job is scheduled, which is the race am trying to address with this patch.

Regards,

Shirish S

Christian.


Andrey



[How]
make GPU reset's amdgpu_device_ip_resume_phase2() &
amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive.

Signed-off-by: Shirish S <shirish.s@amd.com><mailto:shirish.s@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4d9041..7b07a47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -973,6 +973,7 @@ struct amdgpu_device {
      bool                            in_gpu_reset;
      enum pp_mp1_state               mp1_state;
      struct mutex  lock_reset;
+    struct mutex  lock_ib_sched;
      struct amdgpu_doorbell_index doorbell_index;
        int asic_reset_res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad1..63cad74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
      mutex_init(&adev->virt.vf_errors.lock);
      hash_init(adev->mn_hash);
      mutex_init(&adev->lock_reset);
+    mutex_init(&adev->lock_ib_sched);
      mutex_init(&adev->virt.dpm_mutex);
      mutex_init(&adev->psp.mutex);
  @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                  if (r)
                      return r;
  +                mutex_lock(&tmp_adev->lock_ib_sched);
                  r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                mutex_unlock(&tmp_adev->lock_ib_sched);
                  if (r)
                      goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e1bad99..cd6082d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -233,8 +233,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
      if (finished->error < 0) {
          DRM_INFO("Skip scheduling IBs!\n");
      } else {
+        mutex_lock(&ring->adev->lock_ib_sched);
          r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
                         &fence);
+        mutex_unlock(&ring->adev->lock_ib_sched);
          if (r)
              DRM_ERROR("Error scheduling IBs (%d)\n", r);
      }




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx








[-- Attachment #1.2: Type: text/html, Size: 19641 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 14:44                                                 ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-30 14:44 UTC (permalink / raw)
  To: S, Shirish, Koenig, Christian, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

That good  as proof of RCA but I still think we should grab a dedicated 
lock inside scheduler since the race is internal to scheduler code so 
this better to handle it inside the scheduler code to make the fix apply 
for all drivers using it.

Andrey

On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>
>>>> We still have it and isn't doing kthread_park()/unpark() from 
>>>> drm_sched_entity_fini while GPU reset in progress defeats all the 
>>>> purpose of drm_sched_stop->kthread_park ? If 
>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER 
>>>> drm_sched_stop->kthread_park nothing prevents from another (third) 
>>>> thread keep submitting job to HW which will be picked up by the 
>>>> unparked scheduler thread try to submit to HW but fail because the 
>>>> HW ring is deactivated.
>>>>
>>>> If so maybe we should serialize calls to 
>>>> kthread_park/unpark(sched->thread) ?
>>>>
>>>
>>> Yeah, that was my thinking as well. Probably best to just grab the 
>>> reset lock before calling drm_sched_entity_fini().
>>
>>
>> Shirish - please try locking &adev->lock_reset around calls to 
>> drm_sched_entity_fini as Christian suggests and see if this actually 
>> helps the issue.
>>
> Yes that also works.
>
> Regards,
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 14:44                                                 ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-30 14:44 UTC (permalink / raw)
  To: S, Shirish, Koenig, Christian, Deucher, Alexander; +Cc: amd-gfx

That good  as proof of RCA but I still think we should grab a dedicated 
lock inside scheduler since the race is internal to scheduler code so 
this better to handle it inside the scheduler code to make the fix apply 
for all drivers using it.

Andrey

On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>
>>>> We still have it and isn't doing kthread_park()/unpark() from 
>>>> drm_sched_entity_fini while GPU reset in progress defeats all the 
>>>> purpose of drm_sched_stop->kthread_park ? If 
>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER 
>>>> drm_sched_stop->kthread_park nothing prevents from another (third) 
>>>> thread keep submitting job to HW which will be picked up by the 
>>>> unparked scheduler thread try to submit to HW but fail because the 
>>>> HW ring is deactivated.
>>>>
>>>> If so maybe we should serialize calls to 
>>>> kthread_park/unpark(sched->thread) ?
>>>>
>>>
>>> Yeah, that was my thinking as well. Probably best to just grab the 
>>> reset lock before calling drm_sched_entity_fini().
>>
>>
>> Shirish - please try locking &adev->lock_reset around calls to 
>> drm_sched_entity_fini as Christian suggests and see if this actually 
>> helps the issue.
>>
> Yes that also works.
>
> Regards,
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 14:50                                                     ` Christian König
  0 siblings, 0 replies; 38+ messages in thread
From: Christian König @ 2019-10-30 14:50 UTC (permalink / raw)
  To: Grodzovsky, Andrey, S, Shirish, Koenig, Christian, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

A lock inside the scheduler is rather tricky to implement.

What you need to do is to get rid of the park()/unpark() hack in 
drm_sched_entity_fini().

We could do this with a struct completion or convert the scheduler from 
a thread to a work item.

Regards,
Christian.

Am 30.10.19 um 15:44 schrieb Grodzovsky, Andrey:
> That good  as proof of RCA but I still think we should grab a dedicated
> lock inside scheduler since the race is internal to scheduler code so
> this better to handle it inside the scheduler code to make the fix apply
> for all drivers using it.
>
> Andrey
>
> On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>> We still have it and isn't doing kthread_park()/unpark() from
>>>>> drm_sched_entity_fini while GPU reset in progress defeats all the
>>>>> purpose of drm_sched_stop->kthread_park ? If
>>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER
>>>>> drm_sched_stop->kthread_park nothing prevents from another (third)
>>>>> thread keep submitting job to HW which will be picked up by the
>>>>> unparked scheduler thread try to submit to HW but fail because the
>>>>> HW ring is deactivated.
>>>>>
>>>>> If so maybe we should serialize calls to
>>>>> kthread_park/unpark(sched->thread) ?
>>>>>
>>>> Yeah, that was my thinking as well. Probably best to just grab the
>>>> reset lock before calling drm_sched_entity_fini().
>>>
>>> Shirish - please try locking &adev->lock_reset around calls to
>>> drm_sched_entity_fini as Christian suggests and see if this actually
>>> helps the issue.
>>>
>> Yes that also works.
>>
>> Regards,
>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 14:50                                                     ` Christian König
  0 siblings, 0 replies; 38+ messages in thread
From: Christian König @ 2019-10-30 14:50 UTC (permalink / raw)
  To: Grodzovsky, Andrey, S, Shirish, Koenig, Christian, Deucher, Alexander
  Cc: amd-gfx

A lock inside the scheduler is rather tricky to implement.

What you need to do is to get rid of the park()/unpark() hack in 
drm_sched_entity_fini().

We could do this with a struct completion or convert the scheduler from 
a thread to a work item.

Regards,
Christian.

Am 30.10.19 um 15:44 schrieb Grodzovsky, Andrey:
> That good  as proof of RCA but I still think we should grab a dedicated
> lock inside scheduler since the race is internal to scheduler code so
> this better to handle it inside the scheduler code to make the fix apply
> for all drivers using it.
>
> Andrey
>
> On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>> We still have it and isn't doing kthread_park()/unpark() from
>>>>> drm_sched_entity_fini while GPU reset in progress defeats all the
>>>>> purpose of drm_sched_stop->kthread_park ? If
>>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER
>>>>> drm_sched_stop->kthread_park nothing prevents from another (third)
>>>>> thread keep submitting job to HW which will be picked up by the
>>>>> unparked scheduler thread try to submit to HW but fail because the
>>>>> HW ring is deactivated.
>>>>>
>>>>> If so maybe we should serialize calls to
>>>>> kthread_park/unpark(sched->thread) ?
>>>>>
>>>> Yeah, that was my thinking as well. Probably best to just grab the
>>>> reset lock before calling drm_sched_entity_fini().
>>>
>>> Shirish - please try locking &adev->lock_reset around calls to
>>> drm_sched_entity_fini as Christian suggests and see if this actually
>>> helps the issue.
>>>
>> Yes that also works.
>>
>> Regards,
>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 14:56                                                         ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-30 14:56 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Can you elaborate on what is the tricky part with the lock ? I assumed 
we just use per scheduler lock.

Andrey

On 10/30/19 10:50 AM, Christian König wrote:
> A lock inside the scheduler is rather tricky to implement.
>
> What you need to do is to get rid of the park()/unpark() hack in 
> drm_sched_entity_fini().
>
> We could do this with a struct completion or convert the scheduler 
> from a thread to a work item.
>
> Regards,
> Christian.
>
> Am 30.10.19 um 15:44 schrieb Grodzovsky, Andrey:
>> That good  as proof of RCA but I still think we should grab a dedicated
>> lock inside scheduler since the race is internal to scheduler code so
>> this better to handle it inside the scheduler code to make the fix apply
>> for all drivers using it.
>>
>> Andrey
>>
>> On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>>> We still have it and isn't doing kthread_park()/unpark() from
>>>>>> drm_sched_entity_fini while GPU reset in progress defeats all the
>>>>>> purpose of drm_sched_stop->kthread_park ? If
>>>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER
>>>>>> drm_sched_stop->kthread_park nothing prevents from another (third)
>>>>>> thread keep submitting job to HW which will be picked up by the
>>>>>> unparked scheduler thread try to submit to HW but fail because the
>>>>>> HW ring is deactivated.
>>>>>>
>>>>>> If so maybe we should serialize calls to
>>>>>> kthread_park/unpark(sched->thread) ?
>>>>>>
>>>>> Yeah, that was my thinking as well. Probably best to just grab the
>>>>> reset lock before calling drm_sched_entity_fini().
>>>>
>>>> Shirish - please try locking &adev->lock_reset around calls to
>>>> drm_sched_entity_fini as Christian suggests and see if this actually
>>>> helps the issue.
>>>>
>>> Yes that also works.
>>>
>>> Regards,
>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 14:56                                                         ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-30 14:56 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander; +Cc: amd-gfx

Can you elaborate on what is the tricky part with the lock ? I assumed 
we just use per scheduler lock.

Andrey

On 10/30/19 10:50 AM, Christian König wrote:
> A lock inside the scheduler is rather tricky to implement.
>
> What you need to do is to get rid of the park()/unpark() hack in 
> drm_sched_entity_fini().
>
> We could do this with a struct completion or convert the scheduler 
> from a thread to a work item.
>
> Regards,
> Christian.
>
> Am 30.10.19 um 15:44 schrieb Grodzovsky, Andrey:
>> That good  as proof of RCA but I still think we should grab a dedicated
>> lock inside scheduler since the race is internal to scheduler code so
>> this better to handle it inside the scheduler code to make the fix apply
>> for all drivers using it.
>>
>> Andrey
>>
>> On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>>> We still have it and isn't doing kthread_park()/unpark() from
>>>>>> drm_sched_entity_fini while GPU reset in progress defeats all the
>>>>>> purpose of drm_sched_stop->kthread_park ? If
>>>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER
>>>>>> drm_sched_stop->kthread_park nothing prevents from another (third)
>>>>>> thread keep submitting job to HW which will be picked up by the
>>>>>> unparked scheduler thread try to submit to HW but fail because the
>>>>>> HW ring is deactivated.
>>>>>>
>>>>>> If so maybe we should serialize calls to
>>>>>> kthread_park/unpark(sched->thread) ?
>>>>>>
>>>>> Yeah, that was my thinking as well. Probably best to just grab the
>>>>> reset lock before calling drm_sched_entity_fini().
>>>>
>>>> Shirish - please try locking &adev->lock_reset around calls to
>>>> drm_sched_entity_fini as Christian suggests and see if this actually
>>>> helps the issue.
>>>>
>>> Yes that also works.
>>>
>>> Regards,
>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 15:00                                                             ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-30 15:00 UTC (permalink / raw)
  To: Grodzovsky, Andrey, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Yeah, and exactly that's the problem :) You need a global lock covering 
all schedulers.

Otherwise you end up in hell's kitchen again with taking all those locks 
in the right order.

Christian.

Am 30.10.19 um 15:56 schrieb Grodzovsky, Andrey:
> Can you elaborate on what is the tricky part with the lock ? I assumed
> we just use per scheduler lock.
>
> Andrey
>
> On 10/30/19 10:50 AM, Christian König wrote:
>> A lock inside the scheduler is rather tricky to implement.
>>
>> What you need to do is to get rid of the park()/unpark() hack in
>> drm_sched_entity_fini().
>>
>> We could do this with a struct completion or convert the scheduler
>> from a thread to a work item.
>>
>> Regards,
>> Christian.
>>
>> Am 30.10.19 um 15:44 schrieb Grodzovsky, Andrey:
>>> That good  as proof of RCA but I still think we should grab a dedicated
>>> lock inside scheduler since the race is internal to scheduler code so
>>> this better to handle it inside the scheduler code to make the fix apply
>>> for all drivers using it.
>>>
>>> Andrey
>>>
>>> On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>>>> We still have it and isn't doing kthread_park()/unpark() from
>>>>>>> drm_sched_entity_fini while GPU reset in progress defeats all the
>>>>>>> purpose of drm_sched_stop->kthread_park ? If
>>>>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER
>>>>>>> drm_sched_stop->kthread_park nothing prevents from another (third)
>>>>>>> thread keep submitting job to HW which will be picked up by the
>>>>>>> unparked scheduler thread try to submit to HW but fail because the
>>>>>>> HW ring is deactivated.
>>>>>>>
>>>>>>> If so maybe we should serialize calls to
>>>>>>> kthread_park/unpark(sched->thread) ?
>>>>>>>
>>>>>> Yeah, that was my thinking as well. Probably best to just grab the
>>>>>> reset lock before calling drm_sched_entity_fini().
>>>>> Shirish - please try locking &adev->lock_reset around calls to
>>>>> drm_sched_entity_fini as Christian suggests and see if this actually
>>>>> helps the issue.
>>>>>
>>>> Yes that also works.
>>>>
>>>> Regards,
>>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 15:00                                                             ` Koenig, Christian
  0 siblings, 0 replies; 38+ messages in thread
From: Koenig, Christian @ 2019-10-30 15:00 UTC (permalink / raw)
  To: Grodzovsky, Andrey, S, Shirish, Deucher, Alexander; +Cc: amd-gfx

Yeah, and exactly that's the problem :) You need a global lock covering 
all schedulers.

Otherwise you end up in hell's kitchen again with taking all those locks 
in the right order.

Christian.

Am 30.10.19 um 15:56 schrieb Grodzovsky, Andrey:
> Can you elaborate on what is the tricky part with the lock ? I assumed
> we just use per scheduler lock.
>
> Andrey
>
> On 10/30/19 10:50 AM, Christian König wrote:
>> A lock inside the scheduler is rather tricky to implement.
>>
>> What you need to do is to get rid of the park()/unpark() hack in
>> drm_sched_entity_fini().
>>
>> We could do this with a struct completion or convert the scheduler
>> from a thread to a work item.
>>
>> Regards,
>> Christian.
>>
>> Am 30.10.19 um 15:44 schrieb Grodzovsky, Andrey:
>>> That good  as proof of RCA but I still think we should grab a dedicated
>>> lock inside scheduler since the race is internal to scheduler code so
>>> this better to handle it inside the scheduler code to make the fix apply
>>> for all drivers using it.
>>>
>>> Andrey
>>>
>>> On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>>>> We still have it and isn't doing kthread_park()/unpark() from
>>>>>>> drm_sched_entity_fini while GPU reset in progress defeats all the
>>>>>>> purpose of drm_sched_stop->kthread_park ? If
>>>>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER
>>>>>>> drm_sched_stop->kthread_park nothing prevents from another (third)
>>>>>>> thread keep submitting job to HW which will be picked up by the
>>>>>>> unparked scheduler thread try to submit to HW but fail because the
>>>>>>> HW ring is deactivated.
>>>>>>>
>>>>>>> If so maybe we should serialize calls to
>>>>>>> kthread_park/unpark(sched->thread) ?
>>>>>>>
>>>>>> Yeah, that was my thinking as well. Probably best to just grab the
>>>>>> reset lock before calling drm_sched_entity_fini().
>>>>> Shirish - please try locking &adev->lock_reset around calls to
>>>>> drm_sched_entity_fini as Christian suggests and see if this actually
>>>>> helps the issue.
>>>>>
>>>> Yes that also works.
>>>>
>>>> Regards,
>>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 15:05                                                                 ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-30 15:05 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

I see.

OK, I will add to myself a TODO about struct completion approach.

Andrey

On 10/30/19 11:00 AM, Koenig, Christian wrote:
> Yeah, and exactly that's the problem :) You need a global lock covering
> all schedulers.
>
> Otherwise you end up in hell's kitchen again with taking all those locks
> in the right order.
>
> Christian.
>
> Am 30.10.19 um 15:56 schrieb Grodzovsky, Andrey:
>> Can you elaborate on what is the tricky part with the lock ? I assumed
>> we just use per scheduler lock.
>>
>> Andrey
>>
>> On 10/30/19 10:50 AM, Christian König wrote:
>>> A lock inside the scheduler is rather tricky to implement.
>>>
>>> What you need to do is to get rid of the park()/unpark() hack in
>>> drm_sched_entity_fini().
>>>
>>> We could do this with a struct completion or convert the scheduler
>>> from a thread to a work item.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 30.10.19 um 15:44 schrieb Grodzovsky, Andrey:
>>>> That good  as proof of RCA but I still think we should grab a dedicated
>>>> lock inside scheduler since the race is internal to scheduler code so
>>>> this better to handle it inside the scheduler code to make the fix apply
>>>> for all drivers using it.
>>>>
>>>> Andrey
>>>>
>>>> On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>>>>> We still have it and isn't doing kthread_park()/unpark() from
>>>>>>>> drm_sched_entity_fini while GPU reset in progress defeats all the
>>>>>>>> purpose of drm_sched_stop->kthread_park ? If
>>>>>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER
>>>>>>>> drm_sched_stop->kthread_park nothing prevents from another (third)
>>>>>>>> thread keep submitting job to HW which will be picked up by the
>>>>>>>> unparked scheduler thread try to submit to HW but fail because the
>>>>>>>> HW ring is deactivated.
>>>>>>>>
>>>>>>>> If so maybe we should serialize calls to
>>>>>>>> kthread_park/unpark(sched->thread) ?
>>>>>>>>
>>>>>>> Yeah, that was my thinking as well. Probably best to just grab the
>>>>>>> reset lock before calling drm_sched_entity_fini().
>>>>>> Shirish - please try locking &adev->lock_reset around calls to
>>>>>> drm_sched_entity_fini as Christian suggests and see if this actually
>>>>>> helps the issue.
>>>>>>
>>>>> Yes that also works.
>>>>>
>>>>> Regards,
>>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] drm/amdgpu: guard ib scheduling while in reset
@ 2019-10-30 15:05                                                                 ` Grodzovsky, Andrey
  0 siblings, 0 replies; 38+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-30 15:05 UTC (permalink / raw)
  To: Koenig, Christian, S, Shirish, Deucher, Alexander; +Cc: amd-gfx

I see.

OK, I will add to myself a TODO about struct completion approach.

Andrey

On 10/30/19 11:00 AM, Koenig, Christian wrote:
> Yeah, and exactly that's the problem :) You need a global lock covering
> all schedulers.
>
> Otherwise you end up in hell's kitchen again with taking all those locks
> in the right order.
>
> Christian.
>
> Am 30.10.19 um 15:56 schrieb Grodzovsky, Andrey:
>> Can you elaborate on what is the tricky part with the lock ? I assumed
>> we just use per scheduler lock.
>>
>> Andrey
>>
>> On 10/30/19 10:50 AM, Christian König wrote:
>>> A lock inside the scheduler is rather tricky to implement.
>>>
>>> What you need to do is to get rid of the park()/unpark() hack in
>>> drm_sched_entity_fini().
>>>
>>> We could do this with a struct completion or convert the scheduler
>>> from a thread to a work item.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 30.10.19 um 15:44 schrieb Grodzovsky, Andrey:
>>>> That good  as proof of RCA but I still think we should grab a dedicated
>>>> lock inside scheduler since the race is internal to scheduler code so
>>>> this better to handle it inside the scheduler code to make the fix apply
>>>> for all drivers using it.
>>>>
>>>> Andrey
>>>>
>>>> On 10/30/19 4:44 AM, S, Shirish wrote:
>>>>>>>> We still have it and isn't doing kthread_park()/unpark() from
>>>>>>>> drm_sched_entity_fini while GPU reset in progress defeats all the
>>>>>>>> purpose of drm_sched_stop->kthread_park ? If
>>>>>>>> drm_sched_entity_fini-> kthread_unpark happens AFTER
>>>>>>>> drm_sched_stop->kthread_park nothing prevents from another (third)
>>>>>>>> thread keep submitting job to HW which will be picked up by the
>>>>>>>> unparked scheduler thread try to submit to HW but fail because the
>>>>>>>> HW ring is deactivated.
>>>>>>>>
>>>>>>>> If so maybe we should serialize calls to
>>>>>>>> kthread_park/unpark(sched->thread) ?
>>>>>>>>
>>>>>>> Yeah, that was my thinking as well. Probably best to just grab the
>>>>>>> reset lock before calling drm_sched_entity_fini().
>>>>>> Shirish - please try locking &adev->lock_reset around calls to
>>>>>> drm_sched_entity_fini as Christian suggests and see if this actually
>>>>>> helps the issue.
>>>>>>
>>>>> Yes that also works.
>>>>>
>>>>> Regards,
>>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2019-10-30 15:05 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-24 10:58 [PATCH] drm/amdgpu: guard ib scheduling while in reset S, Shirish
2019-10-24 10:58 ` S, Shirish
     [not found] ` <1571914692-9430-1-git-send-email-shirish.s-5C7GfCeVMHo@public.gmane.org>
2019-10-24 11:01   ` Christian König
2019-10-24 11:01     ` Christian König
     [not found]     ` <23ea615d-5ef4-d0b3-a0ec-6fae67b102f2-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-10-24 15:06       ` Grodzovsky, Andrey
2019-10-24 15:06         ` Grodzovsky, Andrey
     [not found]         ` <f3be329d-d350-c821-00b7-d94858335796-5C7GfCeVMHo@public.gmane.org>
2019-10-24 16:30           ` Christian König
2019-10-24 16:30             ` Christian König
     [not found]             ` <d573688c-0997-1928-0c56-b60a29ff7fde-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-10-25  8:50               ` S, Shirish
2019-10-25  8:50                 ` S, Shirish
     [not found]                 ` <b5e99dc3-5658-7e48-63f7-bf9533f582f8-5C7GfCeVMHo@public.gmane.org>
2019-10-25  8:53                   ` Koenig, Christian
2019-10-25  8:53                     ` Koenig, Christian
     [not found]                     ` <2505c476-9e10-f70e-355c-33765a37d607-5C7GfCeVMHo@public.gmane.org>
2019-10-25  9:22                       ` S, Shirish
2019-10-25  9:22                         ` S, Shirish
     [not found]                         ` <a1c31f37-128f-51b1-f747-fe75d78d4214-5C7GfCeVMHo@public.gmane.org>
2019-10-25  9:26                           ` Koenig, Christian
2019-10-25  9:26                             ` Koenig, Christian
     [not found]                             ` <a9789f76-3ba5-ad71-1507-5eac6f589b82-5C7GfCeVMHo@public.gmane.org>
2019-10-25 10:08                               ` S, Shirish
2019-10-25 10:08                                 ` S, Shirish
     [not found]                                 ` <b9a72ccf-4d71-a2b9-63da-e5c19e9fbba6-5C7GfCeVMHo@public.gmane.org>
2019-10-25 10:32                                   ` Koenig, Christian
2019-10-25 10:32                                     ` Koenig, Christian
2019-10-25 15:35                               ` Grodzovsky, Andrey
2019-10-25 15:35                                 ` Grodzovsky, Andrey
     [not found]                                 ` <971115b1-6208-1dd5-d99f-c9377663a80b-5C7GfCeVMHo@public.gmane.org>
2019-10-25 15:57                                   ` Koenig, Christian
2019-10-25 15:57                                     ` Koenig, Christian
     [not found]                                     ` <2e2ebf73-9a25-5ad2-78e7-07c8b1db1b37-5C7GfCeVMHo@public.gmane.org>
2019-10-25 16:02                                       ` Grodzovsky, Andrey
2019-10-25 16:02                                         ` Grodzovsky, Andrey
     [not found]                                         ` <08e3c44f-5d08-d5f5-bc76-ea9b77032e5a-5C7GfCeVMHo@public.gmane.org>
2019-10-30  8:44                                           ` S, Shirish
2019-10-30  8:44                                             ` S, Shirish
     [not found]                                             ` <1e1d0b06-75ab-160c-a6c7-baede02f1e7d-5C7GfCeVMHo@public.gmane.org>
2019-10-30 14:44                                               ` Grodzovsky, Andrey
2019-10-30 14:44                                                 ` Grodzovsky, Andrey
     [not found]                                                 ` <f5b7aeff-c4ce-fa2f-1390-e8892fa7a964-5C7GfCeVMHo@public.gmane.org>
2019-10-30 14:50                                                   ` Christian König
2019-10-30 14:50                                                     ` Christian König
     [not found]                                                     ` <d73c46e2-ed85-f56e-3a2a-cbf2919d0a3f-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-10-30 14:56                                                       ` Grodzovsky, Andrey
2019-10-30 14:56                                                         ` Grodzovsky, Andrey
     [not found]                                                         ` <d881fbbe-5fb1-ea68-6490-d08d81c865dd-5C7GfCeVMHo@public.gmane.org>
2019-10-30 15:00                                                           ` Koenig, Christian
2019-10-30 15:00                                                             ` Koenig, Christian
     [not found]                                                             ` <90eb8377-82c4-968a-82f0-1409f69c17e5-5C7GfCeVMHo@public.gmane.org>
2019-10-30 15:05                                                               ` Grodzovsky, Andrey
2019-10-30 15:05                                                                 ` Grodzovsky, Andrey

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.