* [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-07 10:25 ` Emily Deng
  0 siblings, 0 replies; 80+ messages in thread
From: Emily Deng @ 2019-11-07 10:25 UTC (permalink / raw)
  To: amd-gfx; +Cc: Emily Deng

When the job is already signaled, the s_fence is freed. This then leads to a
null pointer dereference in amdgpu_device_gpu_recover.

Signed-off-by: Emily Deng <Emily.Deng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
 drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6ce949..5a8f08e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 *
 	 * job->base holds a reference to parent fence
 	 */
-	if (job && job->base.s_fence->parent &&
+	if (job && job->base.s_fence && job->base.s_fence->parent &&
 	    dma_fence_is_signaled(job->base.s_fence->parent))
 		job_signaled = true;
 
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 31809ca..56cc10e 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
 
 			spin_lock(&rq->lock);
 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
-				if (bad->s_fence->scheduled.context ==
-				    entity->fence_context) {
+				if (bad->s_fence && (bad->s_fence->scheduled.context ==
+				    entity->fence_context)) {
 					if (atomic_read(&bad->karma) >
 					    bad->sched->hang_limit)
 						if (entity->guilty)
@@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 	 * This iteration is thread safe as sched thread is stopped.
 	 */
 	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
-		if (s_job->s_fence->parent &&
+		if (s_job->s_fence && s_job->s_fence->parent &&
 		    dma_fence_remove_callback(s_job->s_fence->parent,
 					      &s_job->cb)) {
 			atomic_dec(&sched->hw_rq_count);
@@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 			 *
 			 * Job is still alive so fence refcount at least 1
 			 */
-			dma_fence_wait(&s_job->s_fence->finished, false);
+			if (s_job->s_fence)
+				dma_fence_wait(&s_job->s_fence->finished, false);
 
 			/*
 			 * We must keep bad job alive for later use during
@@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 	 * GPU recovers can't run in parallel.
 	 */
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
-		struct dma_fence *fence = s_job->s_fence->parent;
+		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
 
 		atomic_inc(&sched->hw_rq_count);
 
-- 
2.7.4


* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-07 11:28     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-07 11:28 UTC (permalink / raw)
  To: Emily Deng, amd-gfx

On 07.11.19 11:25, Emily Deng wrote:
> When the job is already signaled, the s_fence is freed. Then it will has
> null pointer in amdgpu_device_gpu_recover.

NAK, the s_fence is only set to NULL when the job is destroyed. See 
drm_sched_job_cleanup().
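
For context, that cleanup roughly looks like the sketch below (paraphrased from
memory rather than quoted verbatim from the tree; the relevant point is that
this is the only place s_fence is cleared, just before the driver frees the
job):

void drm_sched_job_cleanup(struct drm_sched_job *job)
{
	/* drop the scheduler fence reference the job was holding */
	dma_fence_put(&job->s_fence->finished);
	/* the only place s_fence is set to NULL */
	job->s_fence = NULL;
}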

When you see a job without an s_fence then that means the problem is 
somewhere else.

Regards,
Christian.

>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>   drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>   2 files changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e6ce949..5a8f08e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	 *
>   	 * job->base holds a reference to parent fence
>   	 */
> -	if (job && job->base.s_fence->parent &&
> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>   	    dma_fence_is_signaled(job->base.s_fence->parent))
>   		job_signaled = true;
>   
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 31809ca..56cc10e 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>   
>   			spin_lock(&rq->lock);
>   			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
> -				if (bad->s_fence->scheduled.context ==
> -				    entity->fence_context) {
> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
> +				    entity->fence_context)) {
>   					if (atomic_read(&bad->karma) >
>   					    bad->sched->hang_limit)
>   						if (entity->guilty)
> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>   	 * This iteration is thread safe as sched thread is stopped.
>   	 */
>   	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
> -		if (s_job->s_fence->parent &&
> +		if (s_job->s_fence && s_job->s_fence->parent &&
>   		    dma_fence_remove_callback(s_job->s_fence->parent,
>   					      &s_job->cb)) {
>   			atomic_dec(&sched->hw_rq_count);
> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>   			 *
>   			 * Job is still alive so fence refcount at least 1
>   			 */
> -			dma_fence_wait(&s_job->s_fence->finished, false);
> +			if (s_job->s_fence)
> +				dma_fence_wait(&s_job->s_fence->finished, false);
>   
>   			/*
>   			 * We must keep bad job alive for later use during
> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>   	 * GPU recovers can't run in parallel.
>   	 */
>   	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
> -		struct dma_fence *fence = s_job->s_fence->parent;
> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>   
>   		atomic_inc(&sched->hw_rq_count);
>   


* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08  2:55         ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08  2:55 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx

>-----Original Message-----
>From: Christian König <ckoenig.leichtzumerken@gmail.com>
>Sent: Thursday, November 7, 2019 7:28 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>On 07.11.19 11:25, Emily Deng wrote:
>> When the job is already signaled, the s_fence is freed. Then it will
>> has null pointer in amdgpu_device_gpu_recover.
>
>NAK, the s_fence is only set to NULL when the job is destroyed. See
>drm_sched_job_cleanup().
I know it is set to NULL in drm_sched_job_cleanup. But in one case, by the time we enter amdgpu_device_gpu_recover, the job has already gone through drm_sched_job_cleanup and is about to be freed. Sometimes amdgpu_device_gpu_recover is faster, so at
that moment the job is not yet freed, but s_fence is already NULL.
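
As a rough illustration only (a minimal userspace model with made-up names,
not the driver code), the interleaving being described is something like:

#include <stdio.h>
#include <stdlib.h>

struct fake_fence { int signaled; };
struct fake_job   { struct fake_fence *s_fence; };

static int recover_path(const struct fake_job *job)
{
	/* the patched condition: job && job->s_fence && ... */
	return job && job->s_fence && job->s_fence->signaled;
}

static void cleanup_step(struct fake_job *job)
{
	/* stands in for drm_sched_job_cleanup(): the fence pointer
	 * becomes NULL here, shortly before the job itself is freed */
	free(job->s_fence);
	job->s_fence = NULL;
}

int main(void)
{
	struct fake_job *job = calloc(1, sizeof(*job));

	if (!job)
		return 1;
	job->s_fence = calloc(1, sizeof(*job->s_fence));

	cleanup_step(job);	/* the free-job worker got here first... */

	/* ...and recovery runs in the window before the job itself is
	 * freed: job is still a valid pointer, job->s_fence is NULL, so
	 * an unchecked job->s_fence->parent access would fault; the
	 * extra NULL test skips it instead. */
	printf("job_signaled=%d\n", recover_path(job));

	free(job);		/* the free-job worker finishes */
	return 0;
}
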
>
>When you see a job without an s_fence then that means the problem is
>somewhere else.
>
>Regards,
>Christian.
>
>>
>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>   drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>   2 files changed, 7 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e6ce949..5a8f08e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>amdgpu_device *adev,
>>   	 *
>>   	 * job->base holds a reference to parent fence
>>   	 */
>> -	if (job && job->base.s_fence->parent &&
>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>   	    dma_fence_is_signaled(job->base.s_fence->parent))
>>   		job_signaled = true;
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index 31809ca..56cc10e 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>drm_sched_job
>> *bad)
>>
>>   			spin_lock(&rq->lock);
>>   			list_for_each_entry_safe(entity, tmp, &rq->entities,
>list) {
>> -				if (bad->s_fence->scheduled.context ==
>> -				    entity->fence_context) {
>> +				if (bad->s_fence && (bad->s_fence-
>>scheduled.context ==
>> +				    entity->fence_context)) {
>>   					if (atomic_read(&bad->karma) >
>>   					    bad->sched->hang_limit)
>>   						if (entity->guilty)
>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler
>*sched, struct drm_sched_job *bad)
>>   	 * This iteration is thread safe as sched thread is stopped.
>>   	 */
>>   	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>ring_mirror_list, node) {
>> -		if (s_job->s_fence->parent &&
>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>   		    dma_fence_remove_callback(s_job->s_fence->parent,
>>   					      &s_job->cb)) {
>>   			atomic_dec(&sched->hw_rq_count);
>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler
>*sched, struct drm_sched_job *bad)
>>   			 *
>>   			 * Job is still alive so fence refcount at least 1
>>   			 */
>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>> +			if (s_job->s_fence)
>> +				dma_fence_wait(&s_job->s_fence->finished,
>false);
>>
>>   			/*
>>   			 * We must keep bad job alive for later use during @@
>-438,7
>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool
>full_recovery)
>>   	 * GPU recovers can't run in parallel.
>>   	 */
>>   	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node)
>{
>> -		struct dma_fence *fence = s_job->s_fence->parent;
>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence-
>>parent :
>> +NULL;
>>
>>   		atomic_inc(&sched->hw_rq_count);
>>


* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08  8:52             ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08  8:52 UTC (permalink / raw)
  To: Deng, Emily, Koenig, Christian, amd-gfx

Ping.....


Best wishes
Emily Deng



>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Deng,
>Emily
>Sent: Friday, November 8, 2019 10:56 AM
>To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>gfx@lists.freedesktop.org
>Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>>-----Original Message-----
>>From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>Sent: Thursday, November 7, 2019 7:28 PM
>>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>>On 07.11.19 11:25, Emily Deng wrote:
>>> When the job is already signaled, the s_fence is freed. Then it will
>>> has null pointer in amdgpu_device_gpu_recover.
>>
>>NAK, the s_fence is only set to NULL when the job is destroyed. See
>>drm_sched_job_cleanup().
>I know it is set to NULL in drm_sched_job_cleanup. But in one case, when it
>enter into the amdgpu_device_gpu_recover, it already in
>drm_sched_job_cleanup, and at this time, it will go to free job. But the
>amdgpu_device_gpu_recover sometimes is faster. At that time, job is not
>freed, but s_fence is already NULL.
>>
>>When you see a job without an s_fence then that means the problem is
>>somewhere else.
>>
>>Regards,
>>Christian.
>>
>>>
>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>   drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>   2 files changed, 7 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index e6ce949..5a8f08e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>amdgpu_device *adev,
>>>   	 *
>>>   	 * job->base holds a reference to parent fence
>>>   	 */
>>> -	if (job && job->base.s_fence->parent &&
>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>   	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>   		job_signaled = true;
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>> index 31809ca..56cc10e 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>drm_sched_job
>>> *bad)
>>>
>>>   			spin_lock(&rq->lock);
>>>   			list_for_each_entry_safe(entity, tmp, &rq->entities,
>>list) {
>>> -				if (bad->s_fence->scheduled.context ==
>>> -				    entity->fence_context) {
>>> +				if (bad->s_fence && (bad->s_fence-
>>>scheduled.context ==
>>> +				    entity->fence_context)) {
>>>   					if (atomic_read(&bad->karma) >
>>>   					    bad->sched->hang_limit)
>>>   						if (entity->guilty)
>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler
>>*sched, struct drm_sched_job *bad)
>>>   	 * This iteration is thread safe as sched thread is stopped.
>>>   	 */
>>>   	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>ring_mirror_list, node) {
>>> -		if (s_job->s_fence->parent &&
>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>   		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>   					      &s_job->cb)) {
>>>   			atomic_dec(&sched->hw_rq_count); @@ -395,7
>+395,8 @@ void
>>> drm_sched_stop(struct drm_gpu_scheduler
>>*sched, struct drm_sched_job *bad)
>>>   			 *
>>>   			 * Job is still alive so fence refcount at least 1
>>>   			 */
>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>> +			if (s_job->s_fence)
>>> +				dma_fence_wait(&s_job->s_fence->finished,
>>false);
>>>
>>>   			/*
>>>   			 * We must keep bad job alive for later use during @@
>>-438,7
>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool
>>full_recovery)
>>>   	 * GPU recovers can't run in parallel.
>>>   	 */
>>>   	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list,
>>> node)
>>{
>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence-
>>>parent :
>>> +NULL;
>>>
>>>   		atomic_inc(&sched->hw_rq_count);
>>>
>

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08  9:07                 ` Koenig, Christian
  0 siblings, 0 replies; 80+ messages in thread
From: Koenig, Christian @ 2019-11-08  9:07 UTC (permalink / raw)
  To: Deng, Emily, amd-gfx

On 08.11.19 09:52, Deng, Emily wrote:
> Ping.....

You need to give me at least enough time to wake up :)

>
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Deng,
>> Emily
>> Sent: Friday, November 8, 2019 10:56 AM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Thursday, November 7, 2019 7:28 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> On 07.11.19 11:25, Emily Deng wrote:
>>>> When the job is already signaled, the s_fence is freed. Then it will
>>>> has null pointer in amdgpu_device_gpu_recover.
>>> NAK, the s_fence is only set to NULL when the job is destroyed. See
>>> drm_sched_job_cleanup().
>> I know it is set to NULL in drm_sched_job_cleanup. But in one case, when it
>> enter into the amdgpu_device_gpu_recover, it already in
>> drm_sched_job_cleanup, and at this time, it will go to free job. But the
>> amdgpu_device_gpu_recover sometimes is faster. At that time, job is not
>> freed, but s_fence is already NULL.

No, that case can't happen. See here:

>         drm_sched_job_cleanup(s_job);
>
>         amdgpu_ring_priority_put(ring, s_job->s_priority);
>         dma_fence_put(job->fence);
>         amdgpu_sync_free(&job->sync);
>         amdgpu_sync_free(&job->sched_sync);
>         kfree(job);

The job itself is freed up directly after freeing the reference to the 
s_fence.

So you are just papering over a much bigger problem here. This patch is 
a clear NAK.

Regards,
Christian.

>>> When you see a job without an s_fence then that means the problem is
>>> somewhere else.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>    drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>    2 files changed, 7 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index e6ce949..5a8f08e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>>    	 *
>>>>    	 * job->base holds a reference to parent fence
>>>>    	 */
>>>> -	if (job && job->base.s_fence->parent &&
>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>    	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>    		job_signaled = true;
>>>>
>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>> index 31809ca..56cc10e 100644
>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>> drm_sched_job
>>>> *bad)
>>>>
>>>>    			spin_lock(&rq->lock);
>>>>    			list_for_each_entry_safe(entity, tmp, &rq->entities,
>>> list) {
>>>> -				if (bad->s_fence->scheduled.context ==
>>>> -				    entity->fence_context) {
>>>> +				if (bad->s_fence && (bad->s_fence-
>>>> scheduled.context ==
>>>> +				    entity->fence_context)) {
>>>>    					if (atomic_read(&bad->karma) >
>>>>    					    bad->sched->hang_limit)
>>>>    						if (entity->guilty)
>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler
>>> *sched, struct drm_sched_job *bad)
>>>>    	 * This iteration is thread safe as sched thread is stopped.
>>>>    	 */
>>>>    	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>> ring_mirror_list, node) {
>>>> -		if (s_job->s_fence->parent &&
>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>    		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>    					      &s_job->cb)) {
>>>>    			atomic_dec(&sched->hw_rq_count); @@ -395,7
>> +395,8 @@ void
>>>> drm_sched_stop(struct drm_gpu_scheduler
>>> *sched, struct drm_sched_job *bad)
>>>>    			 *
>>>>    			 * Job is still alive so fence refcount at least 1
>>>>    			 */
>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>> +			if (s_job->s_fence)
>>>> +				dma_fence_wait(&s_job->s_fence->finished,
>>> false);
>>>>    			/*
>>>>    			 * We must keep bad job alive for later use during @@
>>> -438,7
>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool
>>> full_recovery)
>>>>    	 * GPU recovers can't run in parallel.
>>>>    	 */
>>>>    	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list,
>>>> node)
>>> {
>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence-
>>>> parent :
>>>> +NULL;
>>>>
>>>>    		atomic_inc(&sched->hw_rq_count);
>>>>

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08  9:39                     ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08  9:39 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx

Sorry, please take your time.

Best wishes
Emily Deng



>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Friday, November 8, 2019 5:08 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>On 08.11.19 09:52, Deng, Emily wrote:
>> Ping.....
>
>You need to give me at least enough time to wake up :)
>
>>
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Deng, Emily
>>> Sent: Friday, November 8, 2019 10:56 AM
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>>> -----Original Message-----
>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> On 07.11.19 11:25, Emily Deng wrote:
>>>>> When the job is already signaled, the s_fence is freed. Then it
>>>>> will has null pointer in amdgpu_device_gpu_recover.
>>>> NAK, the s_fence is only set to NULL when the job is destroyed. See
>>>> drm_sched_job_cleanup().
>>> I know it is set to NULL in drm_sched_job_cleanup. But in one case,
>>> when it enter into the amdgpu_device_gpu_recover, it already in
>>> drm_sched_job_cleanup, and at this time, it will go to free job. But
>>> the amdgpu_device_gpu_recover sometimes is faster. At that time, job
>>> is not freed, but s_fence is already NULL.
>
>No, that case can't happen. See here:
>
>>         drm_sched_job_cleanup(s_job);
>>
>>         amdgpu_ring_priority_put(ring, s_job->s_priority);
>>         dma_fence_put(job->fence);
>>         amdgpu_sync_free(&job->sync);
>>         amdgpu_sync_free(&job->sched_sync);
>>         kfree(job);
>
>The job itself is freed up directly after freeing the reference to the s_fence.
>
>So you are just papering over a much bigger problem here. This patch is a
>clear NAK.
>
>Regards,
>Christian.
>
>>>> When you see a job without an s_fence then that means the problem is
>>>> somewhere else.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>    drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>    2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index e6ce949..5a8f08e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>>> amdgpu_device *adev,
>>>>>    	 *
>>>>>    	 * job->base holds a reference to parent fence
>>>>>    	 */
>>>>> -	if (job && job->base.s_fence->parent &&
>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>    	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>    		job_signaled = true;
>>>>>
>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> index 31809ca..56cc10e 100644
>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>> drm_sched_job
>>>>> *bad)
>>>>>
>>>>>    			spin_lock(&rq->lock);
>>>>>    			list_for_each_entry_safe(entity, tmp, &rq->entities,
>>>> list) {
>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>> -				    entity->fence_context) {
>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>> scheduled.context ==
>>>>> +				    entity->fence_context)) {
>>>>>    					if (atomic_read(&bad->karma) >
>>>>>    					    bad->sched->hang_limit)
>>>>>    						if (entity->guilty)
>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler
>>>> *sched, struct drm_sched_job *bad)
>>>>>    	 * This iteration is thread safe as sched thread is stopped.
>>>>>    	 */
>>>>>    	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>> ring_mirror_list, node) {
>>>>> -		if (s_job->s_fence->parent &&
>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>    		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>    					      &s_job->cb)) {
>>>>>    			atomic_dec(&sched->hw_rq_count); @@ -395,7
>>> +395,8 @@ void
>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>> *sched, struct drm_sched_job *bad)
>>>>>    			 *
>>>>>    			 * Job is still alive so fence refcount at least 1
>>>>>    			 */
>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>> +			if (s_job->s_fence)
>>>>> +				dma_fence_wait(&s_job->s_fence->finished,
>>>> false);
>>>>>    			/*
>>>>>    			 * We must keep bad job alive for later use during @@
>>>> -438,7
>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched,
>>>>> +bool
>>>> full_recovery)
>>>>>    	 * GPU recovers can't run in parallel.
>>>>>    	 */
>>>>>    	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list,
>>>>> node)
>>>> {
>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence-
>>>>> parent :
>>>>> +NULL;
>>>>>
>>>>>    		atomic_inc(&sched->hw_rq_count);
>>>>>

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08  9:42                         ` Koenig, Christian
  0 siblings, 0 replies; 80+ messages in thread
From: Koenig, Christian @ 2019-11-08  9:42 UTC (permalink / raw)
  To: Deng, Emily, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 08.11.19 10:39, Deng, Emily wrote:
> Sorry, please take your time.

Have you seen my other response a bit below?

I can't follow how it would be possible for job->s_fence to be NULL 
without the job also being freed.

So it looks like this patch is just papering over some bigger issues.

Regards,
Christian.

>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, November 8, 2019 5:08 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>> Ping.....
>> You need to give me at least enough time to wake up :)
>>
>>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>> Deng, Emily
>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>>> -----Original Message-----
>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>> When the job is already signaled, the s_fence is freed. Then it
>>>>>> will has null pointer in amdgpu_device_gpu_recover.
>>>>> NAK, the s_fence is only set to NULL when the job is destroyed. See
>>>>> drm_sched_job_cleanup().
>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one case,
>>>> when it enter into the amdgpu_device_gpu_recover, it already in
>>>> drm_sched_job_cleanup, and at this time, it will go to free job. But
>>>> the amdgpu_device_gpu_recover sometimes is faster. At that time, job
>>>> is not freed, but s_fence is already NULL.
>> No, that case can't happen. See here:
>>
>>>          drm_sched_job_cleanup(s_job);
>>>
>>>          amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>          dma_fence_put(job->fence);
>>>          amdgpu_sync_free(&job->sync);
>>>          amdgpu_sync_free(&job->sched_sync);
>>>          kfree(job);
>> The job itself is freed up directly after freeing the reference to the s_fence.
>>
>> So you are just papering over a much bigger problem here. This patch is a
>> clear NAK.
>>
>> Regards,
>> Christian.
>>
>>>>> When you see a job without an s_fence then that means the problem is
>>>>> somewhere else.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>     drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>     2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index e6ce949..5a8f08e 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>>>> amdgpu_device *adev,
>>>>>>     	 *
>>>>>>     	 * job->base holds a reference to parent fence
>>>>>>     	 */
>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>     	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>     		job_signaled = true;
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> index 31809ca..56cc10e 100644
>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>> drm_sched_job
>>>>>> *bad)
>>>>>>
>>>>>>     			spin_lock(&rq->lock);
>>>>>>     			list_for_each_entry_safe(entity, tmp, &rq->entities,
>>>>> list) {
>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>> -				    entity->fence_context) {
>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>> scheduled.context ==
>>>>>> +				    entity->fence_context)) {
>>>>>>     					if (atomic_read(&bad->karma) >
>>>>>>     					    bad->sched->hang_limit)
>>>>>>     						if (entity->guilty)
>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler
>>>>> *sched, struct drm_sched_job *bad)
>>>>>>     	 * This iteration is thread safe as sched thread is stopped.
>>>>>>     	 */
>>>>>>     	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>> ring_mirror_list, node) {
>>>>>> -		if (s_job->s_fence->parent &&
>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>     		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>     					      &s_job->cb)) {
>>>>>>     			atomic_dec(&sched->hw_rq_count); @@ -395,7
>>>> +395,8 @@ void
>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>> *sched, struct drm_sched_job *bad)
>>>>>>     			 *
>>>>>>     			 * Job is still alive so fence refcount at least 1
>>>>>>     			 */
>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>> +			if (s_job->s_fence)
>>>>>> +				dma_fence_wait(&s_job->s_fence->finished,
>>>>> false);
>>>>>>     			/*
>>>>>>     			 * We must keep bad job alive for later use during @@
>>>>> -438,7
>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched,
>>>>>> +bool
>>>>> full_recovery)
>>>>>>     	 * GPU recovers can't run in parallel.
>>>>>>     	 */
>>>>>>     	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list,
>>>>>> node)
>>>>> {
>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence-
>>>>>> parent :
>>>>>> +NULL;
>>>>>>
>>>>>>     		atomic_inc(&sched->hw_rq_count);
>>>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:11                             ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08 10:11 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Christian,
     Please refer to the following log. When it enters the amdgpu_device_gpu_recover function, the bad job 000000005086879e is being freed in amdgpu_job_free_cb at the same time, because the hardware fence has signaled. But amdgpu_device_gpu_recover runs faster; in this case the s_fence is already freed while the job itself has not yet been freed, and then this issue occurs. (A small stand-alone sketch of this interleaving follows the log below.)

[  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 timeout, signaled seq=2481, emitted seq=2483
[  449.793202] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process  pid 0 thread  pid 0, s_job:000000005086879e
[  449.794163] amdgpu 0000:00:08.0: GPU reset begin!
[  449.794175] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:000000005086879e
[  449.794221] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:0000000066eb74ab
[  449.794222] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000d4438ad9
[  449.794255] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000b6d69c65
[  449.794257] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000ea85e922
[  449.794287] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
[  449.794366] BUG: unable to handle kernel NULL pointer dereference at 00000000000000c0
[  449.800818] PGD 0 P4D 0
[  449.801040] Oops: 0000 [#1] SMP PTI
[  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE     4.18.0-15-generic #16~18.04.1-Ubuntu
[  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[  449.802944] Workqueue: events drm_sched_job_timedout [amd_sched]
[  449.803488] RIP: 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
[  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff 45 85 e4 0f 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b 98 c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
[  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286
[  449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[  449.806625] RDX: ffffb4c7c08f5ac0 RSI: 0000000fffffffe0 RDI: 0000000000000246
[  449.807224] RBP: ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000
[  449.807818] R10: 0000000000000000 R11: 0000000000000148 R12: 0000000000000000
[  449.808411] R13: ffffb4c7c08f7da0 R14: ffff8d82b8525d40 R15: ffff8d82b8525d40
[  449.809004] FS:  0000000000000000(0000) GS:ffff8d82bfd80000(0000) knlGS:0000000000000000
[  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  449.810153] CR2: 00000000000000c0 CR3: 000000003cc0a001 CR4: 00000000003606e0
[  449.810747] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  449.811937] Call Trace:
[  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu]
[  449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched]
[  449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu]
[  449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched]
[  449.814077]  process_one_work+0x1fd/0x3f0
[  449.814417]  worker_thread+0x34/0x410
[  449.814728]  kthread+0x121/0x140
[  449.815004]  ? process_one_work+0x3f0/0x3f0
[  449.815374]  ? kthread_create_worker_on_cpu+0x70/0x70
[  449.815799]  ret_from_fork+0x35/0x40
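
To make the interleaving easier to see outside the driver, below is a small stand-alone user-space sketch of the same class of race. Everything in it (structure names, functions, timing) is made up purely for illustration and is not taken from the real code; it only shows why a plain NULL test of s_fence narrows the window instead of closing it:

/*
 * Illustrative only: one thread detaches and frees the fence while another
 * thread has already started its check-then-use of it.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_fence { int signaled; };
struct fake_job   { struct fake_fence *s_fence; };

static struct fake_job job;

/* plays the role of the free path on the scheduler side */
static void *free_path(void *arg)
{
	struct fake_fence *fence = job.s_fence;

	(void)arg;
	job.s_fence = NULL;	/* fence detached from the job ...           */
	free(fence);		/* ... and freed, while the job still exists */
	return NULL;
}

/* plays the role of the timeout handler entering recovery */
static void *recover_path(void *arg)
{
	(void)arg;
	/*
	 * Without further synchronisation this check can run just before or
	 * just after free_path(): in the "just before" ordering it passes and
	 * the dereference touches freed memory, in the "just after" ordering
	 * it sees NULL.  The NULL test alone cannot rule out the first case.
	 */
	if (job.s_fence && job.s_fence->signaled)
		printf("guilty job already signaled\n");
	else
		printf("s_fence gone or not signaled\n");
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	job.s_fence = calloc(1, sizeof(*job.s_fence));
	pthread_create(&t1, NULL, recover_path, NULL);
	pthread_create(&t2, NULL, free_path, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}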

>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Friday, November 8, 2019 5:43 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Am 08.11.19 um 10:39 schrieb Deng, Emily:
>> Sorry, please take your time.
>
>Have you seen my other response a bit below?
>
>I can't follow how it would be possible for job->s_fence to be NULL without
>the job also being freed.
>
>So it looks like this patch is just papering over some bigger issues.
>
>Regards,
>Christian.
>
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 5:08 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>> Ping.....
>>> You need to give me at least enough time to wake up :)
>>>
>>>>
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>>> Deng, Emily
>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>> gfx@lists.freedesktop.org
>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>> tdr
>>>>>>
>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>> When the job is already signaled, the s_fence is freed. Then it
>>>>>>> will has null pointer in amdgpu_device_gpu_recover.
>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>> See drm_sched_job_cleanup().
>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one case,
>>>>> when it enter into the amdgpu_device_gpu_recover, it already in
>>>>> drm_sched_job_cleanup, and at this time, it will go to free job.
>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>> time, job is not freed, but s_fence is already NULL.
>>> No, that case can't happen. See here:
>>>
>>>>          drm_sched_job_cleanup(s_job);
>>>>
>>>>          amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>          dma_fence_put(job->fence);
>>>>          amdgpu_sync_free(&job->sync);
>>>>          amdgpu_sync_free(&job->sched_sync);
>>>>          kfree(job);
>>> The job itself is freed up directly after freeing the reference to the s_fence.
>>>
>>> So you are just papering over a much bigger problem here. This patch
>>> is a clear NAK.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>>> When you see a job without an s_fence then that means the problem
>>>>>> is somewhere else.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>     drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>     2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>>>>> amdgpu_device *adev,
>>>>>>>     	 *
>>>>>>>     	 * job->base holds a reference to parent fence
>>>>>>>     	 */
>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>&&
>>>>>>>     	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>     		job_signaled = true;
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> index 31809ca..56cc10e 100644
>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>> drm_sched_job
>>>>>>> *bad)
>>>>>>>
>>>>>>>     			spin_lock(&rq->lock);
>>>>>>>     			list_for_each_entry_safe(entity, tmp, &rq-
>>entities,
>>>>>> list) {
>>>>>>> -				if (bad->s_fence->scheduled.context
>==
>>>>>>> -				    entity->fence_context) {
>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>> scheduled.context ==
>>>>>>> +				    entity->fence_context)) {
>>>>>>>     					if (atomic_read(&bad-
>>karma) >
>>>>>>>     					    bad->sched->hang_limit)
>>>>>>>     						if (entity->guilty)
>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct
>drm_gpu_scheduler
>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>     	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>     	 */
>>>>>>>     	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>> ring_mirror_list, node) {
>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>     		    dma_fence_remove_callback(s_job->s_fence-
>>parent,
>>>>>>>     					      &s_job->cb)) {
>>>>>>>     			atomic_dec(&sched->hw_rq_count); @@ -
>395,7
>>>>> +395,8 @@ void
>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>     			 *
>>>>>>>     			 * Job is still alive so fence refcount at least 1
>>>>>>>     			 */
>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>false);
>>>>>>> +			if (s_job->s_fence)
>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>finished,
>>>>>> false);
>>>>>>>     			/*
>>>>>>>     			 * We must keep bad job alive for later use
>during @@
>>>>>> -438,7
>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched,
>>>>>>> +bool
>>>>>> full_recovery)
>>>>>>>     	 * GPU recovers can't run in parallel.
>>>>>>>     	 */
>>>>>>>     	list_for_each_entry_safe(s_job, tmp,
>>>>>>> &sched->ring_mirror_list,
>>>>>>> node)
>>>>>> {
>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>s_fence-
>>>>>>> parent :
>>>>>>> +NULL;
>>>>>>>
>>>>>>>     		atomic_inc(&sched->hw_rq_count);
>>>>>>>
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:14                                 ` Koenig, Christian
  0 siblings, 0 replies; 80+ messages in thread
From: Koenig, Christian @ 2019-11-08 10:14 UTC (permalink / raw)
  To: Deng, Emily, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Emily,

in this case you are on an old code branch.

Jobs are freed now by the main scheduler thread and only if no timeout 
handler is running.

See this patch here:
> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
> Author: Christian König <christian.koenig@amd.com>
> Date:   Thu Apr 18 11:00:21 2019 -0400
>
>     drm/scheduler: rework job destruction
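
In other words, after that rework the finished jobs are only freed from the scheduler's main loop, and only when no timeout handler can be running against them. A rough sketch of the idea, heavily simplified (locking omitted, not the verbatim upstream code):

/* Simplified sketch of the reworked cleanup step, for illustration only. */
static void cleanup_finished_jobs(struct drm_gpu_scheduler *sched)
{
	/*
	 * Don't touch anything while a timeout might be in flight: only
	 * continue if the pending timeout work could still be cancelled,
	 * i.e. drm_sched_job_timedout() is not currently running.
	 */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !cancel_delayed_work(&sched->work_tdr))
		return;

	for (;;) {
		struct drm_sched_job *job =
			list_first_entry_or_null(&sched->ring_mirror_list,
						 struct drm_sched_job, node);

		if (!job || !dma_fence_is_signaled(&job->s_fence->finished))
			break;

		list_del_init(&job->node);
		/*
		 * The driver's free_job callback runs here; for amdgpu this is
		 * amdgpu_job_free_cb(), which calls drm_sched_job_cleanup()
		 * (setting s_fence to NULL) and then frees the job itself.
		 */
		sched->ops->free_job(job);
	}

	/* re-arm the timeout for whatever is still on the mirror list */
	drm_sched_start_timeout(sched);
}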

Regards,
Christian.

Am 08.11.19 um 11:11 schrieb Deng, Emily:
> Hi Christian,
>       Please refer to follow log, when it enter to amdgpu_device_gpu_recover function, the bad job 000000005086879e is freeing in function  amdgpu_job_free_cb  at the same time, because of the hardware fence signal. But amdgpu_device_gpu_recover goes faster, at this case, the s_fence is already freed, but job is not freed in time. Then this issue occurs.
>
> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 timeout, signaled seq=2481, emitted seq=2483
> [  449.793202] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process  pid 0 thread  pid 0, s_job:000000005086879e
> [  449.794163] amdgpu 0000:00:08.0: GPU reset begin!
> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:000000005086879e
> [  449.794221] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:0000000066eb74ab
> [  449.794222] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000d4438ad9
> [  449.794255] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000b6d69c65
> [  449.794257] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000ea85e922
> [  449.794287] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
> [  449.794366] BUG: unable to handle kernel NULL pointer dereference at 00000000000000c0
> [  449.800818] PGD 0 P4D 0
> [  449.801040] Oops: 0000 [#1] SMP PTI
> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE     4.18.0-15-generic #16~18.04.1-Ubuntu
> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
> [  449.802944] Workqueue: events drm_sched_job_timedout [amd_sched]
> [  449.803488] RIP: 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff 45 85 e4 0f 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b 98 c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286
> [  449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> [  449.806625] RDX: ffffb4c7c08f5ac0 RSI: 0000000fffffffe0 RDI: 0000000000000246
> [  449.807224] RBP: ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000
> [  449.807818] R10: 0000000000000000 R11: 0000000000000148 R12: 0000000000000000
> [  449.808411] R13: ffffb4c7c08f7da0 R14: ffff8d82b8525d40 R15: ffff8d82b8525d40
> [  449.809004] FS:  0000000000000000(0000) GS:ffff8d82bfd80000(0000) knlGS:0000000000000000
> [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  449.810153] CR2: 00000000000000c0 CR3: 000000003cc0a001 CR4: 00000000003606e0
> [  449.810747] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  449.811937] Call Trace:
> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu]
> [  449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched]
> [  449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu]
> [  449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched]
> [  449.814077]  process_one_work+0x1fd/0x3f0
> [  449.814417]  worker_thread+0x34/0x410
> [  449.814728]  kthread+0x121/0x140
> [  449.815004]  ? process_one_work+0x3f0/0x3f0
> [  449.815374]  ? kthread_create_worker_on_cpu+0x70/0x70
> [  449.815799]  ret_from_fork+0x35/0x40
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, November 8, 2019 5:43 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>> Sorry, please take your time.
>> Have you seen my other response a bit below?
>>
>> I can't follow how it would be possible for job->s_fence to be NULL without
>> the job also being freed.
>>
>> So it looks like this patch is just papering over some bigger issues.
>>
>> Regards,
>> Christian.
>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>> Ping.....
>>>> You need to give me at least enough time to wake up :)
>>>>
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>>>> Deng, Emily
>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>> gfx@lists.freedesktop.org
>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>> When the job is already signaled, the s_fence is freed. Then it
>>>>>>>> will has null pointer in amdgpu_device_gpu_recover.
>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>> See drm_sched_job_cleanup().
>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one case,
>>>>>> when it enter into the amdgpu_device_gpu_recover, it already in
>>>>>> drm_sched_job_cleanup, and at this time, it will go to free job.
>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>> time, job is not freed, but s_fence is already NULL.
>>>> No, that case can't happen. See here:
>>>>
>>>>>           drm_sched_job_cleanup(s_job);
>>>>>
>>>>>           amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>           dma_fence_put(job->fence);
>>>>>           amdgpu_sync_free(&job->sync);
>>>>>           amdgpu_sync_free(&job->sched_sync);
>>>>>           kfree(job);
>>>> The job itself is freed up directly after freeing the reference to the s_fence.
>>>>
>>>> So you are just papering over a much bigger problem here. This patch
>>>> is a clear NAK.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>>> When you see a job without an s_fence then that means the problem
>>>>>>> is somewhere else.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>      drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>      2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>>      	 *
>>>>>>>>      	 * job->base holds a reference to parent fence
>>>>>>>>      	 */
>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>> &&
>>>>>>>>      	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>      		job_signaled = true;
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>> drm_sched_job
>>>>>>>> *bad)
>>>>>>>>
>>>>>>>>      			spin_lock(&rq->lock);
>>>>>>>>      			list_for_each_entry_safe(entity, tmp, &rq-
>>> entities,
>>>>>>> list) {
>>>>>>>> -				if (bad->s_fence->scheduled.context
>> ==
>>>>>>>> -				    entity->fence_context) {
>>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>>> scheduled.context ==
>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>      					if (atomic_read(&bad-
>>> karma) >
>>>>>>>>      					    bad->sched->hang_limit)
>>>>>>>>      						if (entity->guilty)
>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct
>> drm_gpu_scheduler
>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>      	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>      	 */
>>>>>>>>      	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>> ring_mirror_list, node) {
>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>      		    dma_fence_remove_callback(s_job->s_fence-
>>> parent,
>>>>>>>>      					      &s_job->cb)) {
>>>>>>>>      			atomic_dec(&sched->hw_rq_count); @@ -
>> 395,7
>>>>>> +395,8 @@ void
>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>      			 *
>>>>>>>>      			 * Job is still alive so fence refcount at least 1
>>>>>>>>      			 */
>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>> false);
>>>>>>>> +			if (s_job->s_fence)
>>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>> finished,
>>>>>>> false);
>>>>>>>>      			/*
>>>>>>>>      			 * We must keep bad job alive for later use
>> during @@
>>>>>>> -438,7
>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched,
>>>>>>>> +bool
>>>>>>> full_recovery)
>>>>>>>>      	 * GPU recovers can't run in parallel.
>>>>>>>>      	 */
>>>>>>>>      	list_for_each_entry_safe(s_job, tmp,
>>>>>>>> &sched->ring_mirror_list,
>>>>>>>> node)
>>>>>>> {
>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>> s_fence-
>>>>>>>> parent :
>>>>>>>> +NULL;
>>>>>>>>
>>>>>>>>      		atomic_inc(&sched->hw_rq_count);
>>>>>>>>
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:22                                     ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08 10:22 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Christian,
     No, I am on the new branch and I already have that patch. Even if the jobs are freed by the main scheduler, how could we prevent the main scheduler from freeing a job while we are inside amdgpu_device_gpu_recover?
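
Just to illustrate what I mean: assuming the free really can run concurrently, as in the log I posted earlier, even a purely hypothetical variant like the one below (not a proposal and not existing code) would still leave a window between testing s_fence and using it:

	/* Hypothetical illustration only - not existing code, not a proposed fix. */
	struct dma_fence *parent = NULL;

	if (job && job->base.s_fence)	/* s_fence can still become NULL ...     */
		parent = dma_fence_get(job->base.s_fence->parent); /* ... right here */

	if (parent && dma_fence_is_signaled(parent))
		job_signaled = true;

	dma_fence_put(parent);		/* dma_fence_put(NULL) is a no-op */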

Best wishes
Emily Deng

  

>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Friday, November 8, 2019 6:15 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Hi Emily,
>
>in this case you are on an old code branch.
>
>Jobs are freed now by the main scheduler thread and only if no timeout
>handler is running.
>
>See this patch here:
>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>
>>     drm/scheduler: rework job destruction
>
>Regards,
>Christian.
>
>Am 08.11.19 um 11:11 schrieb Deng, Emily:
>> Hi Christian,
>>       Please refer to follow log, when it enter to amdgpu_device_gpu_recover
>function, the bad job 000000005086879e is freeing in function
>amdgpu_job_free_cb  at the same time, because of the hardware fence signal.
>But amdgpu_device_gpu_recover goes faster, at this case, the s_fence is
>already freed, but job is not freed in time. Then this issue occurs.
>>
>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0
>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163] amdgpu
>0000:00:08.0: GPU reset begin!
>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>> pid 0, s_job:0000000066eb74ab [  449.794222]
>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0,
>s_job:00000000ea85e922 [  449.794287] Emily:amdgpu_job_free_cb,Process
>information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>[  449.794366] BUG: unable to handle kernel NULL pointer dereference at
>00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>[#1] SMP PTI
>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>4.18.0-15-generic #16~18.04.1-Ubuntu
>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>> BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944] Workqueue: events
>> drm_sched_job_timedout [amd_sched] [  449.803488] RIP:
>0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff 45 85 e4 0f
>85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b 98
>c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>[  449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>0000000000000400 [  449.811937] Call Trace:
>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>> kthread_create_worker_on_cpu+0x70/0x70
>> [  449.815799]  ret_from_fork+0x35/0x40
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 5:43 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>> Sorry, please take your time.
>>> Have you seen my other response a bit below?
>>>
>>> I can't follow how it would be possible for job->s_fence to be NULL
>>> without the job also being freed.
>>>
>>> So it looks like this patch is just papering over some bigger issues.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>> Ping.....
>>>>> You need to give me at least enough time to wake up :)
>>>>>
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf
>>>>>>> Of Deng, Emily
>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>> gfx@lists.freedesktop.org
>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>> When the job is already signaled, the s_fence is freed. Then it
>>>>>>>>> will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>> See drm_sched_job_cleanup().
>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go to free
>job.
>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>> No, that case can't happen. See here:
>>>>>
>>>>>>           drm_sched_job_cleanup(s_job);
>>>>>>
>>>>>>           amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>           dma_fence_put(job->fence);
>>>>>>           amdgpu_sync_free(&job->sync);
>>>>>>           amdgpu_sync_free(&job->sched_sync);
>>>>>>           kfree(job);
>>>>> The job itself is freed up directly after freeing the reference to the
>s_fence.
>>>>>
>>>>> So you are just papering over a much bigger problem here. This
>>>>> patch is a clear NAK.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>> problem is somewhere else.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>> ---
>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>      drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>      2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>>      	 *
>>>>>>>>>      	 * job->base holds a reference to parent fence
>>>>>>>>>      	 */
>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>>> &&
>>>>>>>>>      	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>      		job_signaled = true;
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>> drm_sched_job
>>>>>>>>> *bad)
>>>>>>>>>
>>>>>>>>>      			spin_lock(&rq->lock);
>>>>>>>>>      			list_for_each_entry_safe(entity, tmp, &rq-
>>>> entities,
>>>>>>>> list) {
>>>>>>>>> -				if (bad->s_fence->scheduled.context
>>> ==
>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>>>> scheduled.context ==
>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>      					if (atomic_read(&bad-
>>>> karma) >
>>>>>>>>>      					    bad->sched->hang_limit)
>>>>>>>>>      						if (entity->guilty)
>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct
>>> drm_gpu_scheduler
>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>      	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>>      	 */
>>>>>>>>>      	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>      		    dma_fence_remove_callback(s_job->s_fence-
>>>> parent,
>>>>>>>>>      					      &s_job->cb)) {
>>>>>>>>>      			atomic_dec(&sched->hw_rq_count); @@ -
>>> 395,7
>>>>>>> +395,8 @@ void
>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>      			 *
>>>>>>>>>      			 * Job is still alive so fence refcount at least 1
>>>>>>>>>      			 */
>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>>> false);
>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>>> finished,
>>>>>>>> false);
>>>>>>>>>      			/*
>>>>>>>>>      			 * We must keep bad job alive for later use
>>> during @@
>>>>>>>> -438,7
>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>*sched,
>>>>>>>>> +bool
>>>>>>>> full_recovery)
>>>>>>>>>      	 * GPU recovers can't run in parallel.
>>>>>>>>>      	 */
>>>>>>>>>      	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>> node)
>>>>>>>> {
>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>>> s_fence-
>>>>>>>>> parent :
>>>>>>>>> +NULL;
>>>>>>>>>
>>>>>>>>>      		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:22                                     ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08 10:22 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx

Hi Christian,
     No, I am on the new branch and I also have the patch. Even if the jobs are freed by the main scheduler, how can we prevent the main scheduler from freeing jobs while we are entering amdgpu_device_gpu_recover?
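
To make the window I mean concrete, below is a minimal userspace sketch of the interleaving. It is only an illustration with hypothetical names; fake_job and fake_fence are not the real driver structures, and the two threads only stand in for the scheduler free path and the recover path.

/*
 * Sketch only, hypothetical names: thread A plays the scheduler free path,
 * thread B plays the recover path that may run at the same time.
 * Build: gcc -pthread race_sketch.c -o race_sketch
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_fence { int signaled; };
struct fake_job { struct fake_fence *s_fence; };

static struct fake_job *job;

/* Plays the scheduler main thread: clear s_fence, then free the job. */
static void *free_path(void *arg)
{
	struct fake_fence *fence = job->s_fence;

	job->s_fence = NULL;	/* the cleanup step that clears s_fence */
	free(fence);
	free(job);		/* the job itself is freed right after  */
	return NULL;
}

/* Plays the recover path checking the job concurrently. */
static void *recover_path(void *arg)
{
	/* Even with a NULL check, "job" itself may already be freed here. */
	if (job && job->s_fence && job->s_fence->signaled)
		printf("job already signaled\n");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	job = calloc(1, sizeof(*job));
	job->s_fence = calloc(1, sizeof(*job->s_fence));

	pthread_create(&a, NULL, free_path, NULL);
	pthread_create(&b, NULL, recover_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

In this toy model thread B can touch memory that thread A has already freed; that overlap is exactly what I am asking how to avoid.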

Best wishes
Emily Deng

  

>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Friday, November 8, 2019 6:15 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Hi Emily,
>
>in this case you are on an old code branch.
>
>Jobs are freed now by the main scheduler thread and only if no timeout
>handler is running.
>
>See this patch here:
>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>
>>     drm/scheduler: rework job destruction
>
>Regards,
>Christian.
>
>Am 08.11.19 um 11:11 schrieb Deng, Emily:
>> Hi Christian,
>>       Please refer to follow log, when it enter to amdgpu_device_gpu_recover
>function, the bad job 000000005086879e is freeing in function
>amdgpu_job_free_cb  at the same time, because of the hardware fence signal.
>But amdgpu_device_gpu_recover goes faster, at this case, the s_fence is
>already freed, but job is not freed in time. Then this issue occurs.
>>
>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0
>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163] amdgpu
>0000:00:08.0: GPU reset begin!
>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>> pid 0, s_job:0000000066eb74ab [  449.794222]
>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0,
>s_job:00000000ea85e922 [  449.794287] Emily:amdgpu_job_free_cb,Process
>information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>[  449.794366] BUG: unable to handle kernel NULL pointer dereference at
>00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>[#1] SMP PTI
>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>4.18.0-15-generic #16~18.04.1-Ubuntu
>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>> BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944] Workqueue: events
>> drm_sched_job_timedout [amd_sched] [  449.803488] RIP:
>0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff 45 85 e4 0f
>85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b 98
>c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>[  449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>0000000000000400 [  449.811937] Call Trace:
>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>> kthread_create_worker_on_cpu+0x70/0x70
>> [  449.815799]  ret_from_fork+0x35/0x40
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 5:43 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>> Sorry, please take your time.
>>> Have you seen my other response a bit below?
>>>
>>> I can't follow how it would be possible for job->s_fence to be NULL
>>> without the job also being freed.
>>>
>>> So it looks like this patch is just papering over some bigger issues.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>> Ping.....
>>>>> You need to give me at least enough time to wake up :)
>>>>>
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf
>>>>>>> Of Deng, Emily
>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>> gfx@lists.freedesktop.org
>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>> When the job is already signaled, the s_fence is freed. Then it
>>>>>>>>> will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>> See drm_sched_job_cleanup().
>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go to free
>job.
>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>> No, that case can't happen. See here:
>>>>>
>>>>>>           drm_sched_job_cleanup(s_job);
>>>>>>
>>>>>>           amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>           dma_fence_put(job->fence);
>>>>>>           amdgpu_sync_free(&job->sync);
>>>>>>           amdgpu_sync_free(&job->sched_sync);
>>>>>>           kfree(job);
>>>>> The job itself is freed up directly after freeing the reference to the
>s_fence.
>>>>>
>>>>> So you are just papering over a much bigger problem here. This
>>>>> patch is a clear NAK.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>> problem is somewhere else.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>> ---
>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>      drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>      2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>>      	 *
>>>>>>>>>      	 * job->base holds a reference to parent fence
>>>>>>>>>      	 */
>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>>> &&
>>>>>>>>>      	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>      		job_signaled = true;
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>> drm_sched_job
>>>>>>>>> *bad)
>>>>>>>>>
>>>>>>>>>      			spin_lock(&rq->lock);
>>>>>>>>>      			list_for_each_entry_safe(entity, tmp, &rq-
>>>> entities,
>>>>>>>> list) {
>>>>>>>>> -				if (bad->s_fence->scheduled.context
>>> ==
>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>>>> scheduled.context ==
>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>      					if (atomic_read(&bad-
>>>> karma) >
>>>>>>>>>      					    bad->sched->hang_limit)
>>>>>>>>>      						if (entity->guilty)
>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct
>>> drm_gpu_scheduler
>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>      	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>>      	 */
>>>>>>>>>      	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>      		    dma_fence_remove_callback(s_job->s_fence-
>>>> parent,
>>>>>>>>>      					      &s_job->cb)) {
>>>>>>>>>      			atomic_dec(&sched->hw_rq_count); @@ -
>>> 395,7
>>>>>>> +395,8 @@ void
>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>      			 *
>>>>>>>>>      			 * Job is still alive so fence refcount at least 1
>>>>>>>>>      			 */
>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>>> false);
>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>>> finished,
>>>>>>>> false);
>>>>>>>>>      			/*
>>>>>>>>>      			 * We must keep bad job alive for later use
>>> during @@
>>>>>>>> -438,7
>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>*sched,
>>>>>>>>> +bool
>>>>>>>> full_recovery)
>>>>>>>>>      	 * GPU recovers can't run in parallel.
>>>>>>>>>      	 */
>>>>>>>>>      	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>> node)
>>>>>>>> {
>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>>> s_fence-
>>>>>>>>> parent :
>>>>>>>>> +NULL;
>>>>>>>>>
>>>>>>>>>      		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:26                                         ` Koenig, Christian
  0 siblings, 0 replies; 80+ messages in thread
From: Koenig, Christian @ 2019-11-08 10:26 UTC (permalink / raw)
  To: Deng, Emily, amd-gfx

Hi Emily,

Well, who is calling amdgpu_device_gpu_recover() in this case?

If it's not the scheduler, we shouldn't have a guilty job in the first
place.

Regards,
Christian.

Am 08.11.19 um 11:22 schrieb Deng, Emily:
> Hi Chrisitan,
>       No, I am with the new branch and also has the patch. Even it are freed by main scheduler, how we could avoid main scheduler to free jobs while enter to function amdgpu_device_gpu_recover?
>
> Best wishes
> Emily Deng
>
>    
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, November 8, 2019 6:15 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Hi Emily,
>>
>> in this case you are on an old code branch.
>>
>> Jobs are freed now by the main scheduler thread and only if no timeout
>> handler is running.
>>
>> See this patch here:
>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>> Author: Christian König <christian.koenig@amd.com>
>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>
>>>      drm/scheduler: rework job destruction
>> Regards,
>> Christian.
>>
>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>> Hi Christian,
>>>        Please refer to follow log, when it enter to amdgpu_device_gpu_recover
>> function, the bad job 000000005086879e is freeing in function
>> amdgpu_job_free_cb  at the same time, because of the hardware fence signal.
>> But amdgpu_device_gpu_recover goes faster, at this case, the s_fence is
>> already freed, but job is not freed in time. Then this issue occurs.
>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0
>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163] amdgpu
>> 0000:00:08.0: GPU reset begin!
>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>> pid 0, s_job:0000000066eb74ab [  449.794222]
>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0,
>> s_job:00000000ea85e922 [  449.794287] Emily:amdgpu_job_free_cb,Process
>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>> [  449.794366] BUG: unable to handle kernel NULL pointer dereference at
>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>> [#1] SMP PTI
>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>>> BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944] Workqueue: events
>>> drm_sched_job_timedout [amd_sched] [  449.803488] RIP:
>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff 45 85 e4 0f
>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b 98
>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> [  449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>> 0000000000000400 [  449.811937] Call Trace:
>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>> kthread_create_worker_on_cpu+0x70/0x70
>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>> Sorry, please take your time.
>>>> Have you seen my other response a bit below?
>>>>
>>>> I can't follow how it would be possible for job->s_fence to be NULL
>>>> without the job also being freed.
>>>>
>>>> So it looks like this patch is just papering over some bigger issues.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>> Ping.....
>>>>>> You need to give me at least enough time to wake up :)
>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf
>>>>>>>> Of Deng, Emily
>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>> When the job is already signaled, the s_fence is freed. Then it
>>>>>>>>>> will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go to free
>> job.
>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>>> No, that case can't happen. See here:
>>>>>>
>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>>>>
>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>            dma_fence_put(job->fence);
>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>>>>            kfree(job);
>>>>>> The job itself is freed up directly after freeing the reference to the
>> s_fence.
>>>>>> So you are just papering over a much bigger problem here. This
>>>>>> patch is a clear NAK.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>> problem is somewhere else.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>> ---
>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>       	 *
>>>>>>>>>>       	 * job->base holds a reference to parent fence
>>>>>>>>>>       	 */
>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>>>> &&
>>>>>>>>>>       	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>       		job_signaled = true;
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>>> drm_sched_job
>>>>>>>>>> *bad)
>>>>>>>>>>
>>>>>>>>>>       			spin_lock(&rq->lock);
>>>>>>>>>>       			list_for_each_entry_safe(entity, tmp, &rq-
>>>>> entities,
>>>>>>>>> list) {
>>>>>>>>>> -				if (bad->s_fence->scheduled.context
>>>> ==
>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>>>>> scheduled.context ==
>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>       					if (atomic_read(&bad-
>>>>> karma) >
>>>>>>>>>>       					    bad->sched->hang_limit)
>>>>>>>>>>       						if (entity->guilty)
>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct
>>>> drm_gpu_scheduler
>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>       	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>>>       	 */
>>>>>>>>>>       	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>       		    dma_fence_remove_callback(s_job->s_fence-
>>>>> parent,
>>>>>>>>>>       					      &s_job->cb)) {
>>>>>>>>>>       			atomic_dec(&sched->hw_rq_count); @@ -
>>>> 395,7
>>>>>>>> +395,8 @@ void
>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>       			 *
>>>>>>>>>>       			 * Job is still alive so fence refcount at least 1
>>>>>>>>>>       			 */
>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>>>> false);
>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>>>> finished,
>>>>>>>>> false);
>>>>>>>>>>       			/*
>>>>>>>>>>       			 * We must keep bad job alive for later use
>>>> during @@
>>>>>>>>> -438,7
>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>> *sched,
>>>>>>>>>> +bool
>>>>>>>>> full_recovery)
>>>>>>>>>>       	 * GPU recovers can't run in parallel.
>>>>>>>>>>       	 */
>>>>>>>>>>       	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>> node)
>>>>>>>>> {
>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>>>> s_fence-
>>>>>>>>>> parent :
>>>>>>>>>> +NULL;
>>>>>>>>>>
>>>>>>>>>>       		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>
>>>>>>>> _______________________________________________
>>>>>>>> amd-gfx mailing list
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:32                                             ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08 10:32 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx

Hi Christian,
     The drm_sched_job_timedout -> amdgpu_job_timedout path calls amdgpu_device_gpu_recover. I mean the main scheduler frees the jobs while we are already in amdgpu_device_gpu_recover, before drm_sched_stop is called.
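
As a toy illustration of that ordering (stand-in functions only, nothing here is the real drm or amdgpu code), the window is between entering the recover path and the point where the scheduler is parked:

/*
 * Ordering sketch with hypothetical names: the "scheduler" thread keeps
 * freeing finished jobs until the "tdr" thread parks it, so anything the
 * tdr thread grabbed before parking can go away in between.
 * Build: gcc -pthread ordering_sketch.c -o ordering_sketch
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int scheduler_parked;

/* Stands in for the scheduler main thread that frees signaled jobs. */
static void *scheduler_thread(void *arg)
{
	while (!__atomic_load_n(&scheduler_parked, __ATOMIC_ACQUIRE)) {
		printf("scheduler: free one finished job\n");
		usleep(200);
	}
	return NULL;
}

/* Stands in for the timeout handler entering recovery, then parking. */
static void *timeout_worker(void *arg)
{
	printf("tdr: enter recovery (job pointer already taken)\n");
	usleep(1000);	/* the window: jobs are still being freed here */
	__atomic_store_n(&scheduler_parked, 1, __ATOMIC_RELEASE);
	printf("tdr: scheduler parked, job list is stable from now on\n");
	return NULL;
}

int main(void)
{
	pthread_t sched, tdr;

	pthread_create(&sched, NULL, scheduler_thread, NULL);
	pthread_create(&tdr, NULL, timeout_worker, NULL);
	pthread_join(tdr, NULL);
	pthread_join(sched, NULL);
	return 0;
}

That in-between stretch is what I mean by the jobs being freed before drm_sched_stop is called.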

Best wishes
Emily Deng



>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Friday, November 8, 2019 6:26 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Hi Emily,
>
>well who is calling amdgpu_device_gpu_recover() in this case?
>
>When it's not the scheduler we shouldn't have a guilty job in the first place.
>
>Regards,
>Christian.
>
>Am 08.11.19 um 11:22 schrieb Deng, Emily:
>> Hi Chrisitan,
>>       No, I am with the new branch and also has the patch. Even it are freed by
>main scheduler, how we could avoid main scheduler to free jobs while enter
>to function amdgpu_device_gpu_recover?
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 6:15 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Hi Emily,
>>>
>>> in this case you are on an old code branch.
>>>
>>> Jobs are freed now by the main scheduler thread and only if no
>>> timeout handler is running.
>>>
>>> See this patch here:
>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>> Author: Christian König <christian.koenig@amd.com>
>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>
>>>>      drm/scheduler: rework job destruction
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>> Hi Christian,
>>>>        Please refer to follow log, when it enter to
>>>> amdgpu_device_gpu_recover
>>> function, the bad job 000000005086879e is freeing in function
>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>signal.
>>> But amdgpu_device_gpu_recover goes faster, at this case, the s_fence
>>> is already freed, but job is not freed in time. Then this issue occurs.
>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>sdma0
>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>> amdgpu
>>> 0000:00:08.0: GPU reset begin!
>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>>>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>> pid 0, s_job:0000000066eb74ab [  449.794222]
>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>> pid 0,
>>> s_job:00000000ea85e922 [  449.794287]
>>> Emily:amdgpu_job_free_cb,Process
>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference at
>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>>> [#1] SMP PTI
>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [  449.803488]
>RIP:
>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff
>>>> 45 85 e4 0f
>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b
>98
>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>> 0000000000000400 [  449.811937] Call Trace:
>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>> Sorry, please take your time.
>>>>> Have you seen my other response a bit below?
>>>>>
>>>>> I can't follow how it would be possible for job->s_fence to be NULL
>>>>> without the job also being freed.
>>>>>
>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>> Ping.....
>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>Behalf
>>>>>>>>> Of Deng, Emily
>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>
>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>> When the job is already signaled, the s_fence is freed. Then
>>>>>>>>>>> it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go
>>>>>>>>> to free
>>> job.
>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>>>> No, that case can't happen. See here:
>>>>>>>
>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>>>>>
>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>            dma_fence_put(job->fence);
>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>>>>>            kfree(job);
>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>> to the
>>> s_fence.
>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>> patch is a clear NAK.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>> ---
>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>amdgpu_device_gpu_recover(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>       	 *
>>>>>>>>>>>       	 * job->base holds a reference to parent fence
>>>>>>>>>>>       	 */
>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>>>>> &&
>>>>>>>>>>>       	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>       		job_signaled = true;
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>>>> drm_sched_job
>>>>>>>>>>> *bad)
>>>>>>>>>>>
>>>>>>>>>>>       			spin_lock(&rq->lock);
>>>>>>>>>>>       			list_for_each_entry_safe(entity, tmp,
>&rq-
>>>>>> entities,
>>>>>>>>>> list) {
>>>>>>>>>>> -				if (bad->s_fence->scheduled.context
>>>>> ==
>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>       					if (atomic_read(&bad-
>>>>>> karma) >
>>>>>>>>>>>       					    bad->sched-
>>hang_limit)
>>>>>>>>>>>       						if (entity-
>>guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>> drm_sched_stop(struct
>>>>> drm_gpu_scheduler
>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>       	 * This iteration is thread safe as sched thread is
>stopped.
>>>>>>>>>>>       	 */
>>>>>>>>>>>       	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>       		    dma_fence_remove_callback(s_job-
>>s_fence-
>>>>>> parent,
>>>>>>>>>>>       					      &s_job->cb)) {
>>>>>>>>>>>       			atomic_dec(&sched->hw_rq_count);
>@@ -
>>>>> 395,7
>>>>>>>>> +395,8 @@ void
>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>       			 *
>>>>>>>>>>>       			 * Job is still alive so fence refcount at
>least 1
>>>>>>>>>>>       			 */
>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>>>>> false);
>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>>>>> finished,
>>>>>>>>>> false);
>>>>>>>>>>>       			/*
>>>>>>>>>>>       			 * We must keep bad job alive for later
>use
>>>>> during @@
>>>>>>>>>> -438,7
>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>> *sched,
>>>>>>>>>>> +bool
>>>>>>>>>> full_recovery)
>>>>>>>>>>>       	 * GPU recovers can't run in parallel.
>>>>>>>>>>>       	 */
>>>>>>>>>>>       	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>> node)
>>>>>>>>>> {
>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>>>>> s_fence-
>>>>>>>>>>> parent :
>>>>>>>>>>> +NULL;
>>>>>>>>>>>
>>>>>>>>>>>       		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>
>>>>>>>>> _______________________________________________
>>>>>>>>> amd-gfx mailing list
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:32                                             ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08 10:32 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx

Hi Christian,
     The drm_sched_job_timedout -> amdgpu_job_timedout path calls amdgpu_device_gpu_recover. I mean the main scheduler frees the jobs while we are already in amdgpu_device_gpu_recover, before drm_sched_stop is called.
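
To make the ordering concrete, here is a rough sketch of what I suspect happens (only the function names and the check from the patch are real; the interleaving comments are illustrative, not actual code):

	/*
	 * timeout worker (kworker)                scheduler main thread
	 * ------------------------                ---------------------
	 * drm_sched_job_timedout(work)
	 *   amdgpu_job_timedout(s_job)
	 *     amdgpu_device_gpu_recover(adev, job)
	 *                                         free_job callback runs:
	 *                                           drm_sched_job_cleanup(s_job)
	 *                                             -> job->base.s_fence = NULL
	 *                                           (kfree(job) not reached yet)
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent))
		job_signaled = true;
	/*
	 * With s_fence already NULL, reading ->parent faults at offset 0xc0,
	 * which is consistent with CR2: 00000000000000c0 in the oops above.
	 * drm_sched_stop() is only reached after this point.
	 */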

Best wishes
Emily Deng



>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Friday, November 8, 2019 6:26 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Hi Emily,
>
>well who is calling amdgpu_device_gpu_recover() in this case?
>
>When it's not the scheduler we shouldn't have a guilty job in the first place.
>
>Regards,
>Christian.
>
>Am 08.11.19 um 11:22 schrieb Deng, Emily:
>> Hi Chrisitan,
>>       No, I am with the new branch and also has the patch. Even it are freed by
>main scheduler, how we could avoid main scheduler to free jobs while enter
>to function amdgpu_device_gpu_recover?
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 6:15 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Hi Emily,
>>>
>>> in this case you are on an old code branch.
>>>
>>> Jobs are freed now by the main scheduler thread and only if no
>>> timeout handler is running.
>>>
>>> See this patch here:
>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>> Author: Christian König <christian.koenig@amd.com>
>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>
>>>>      drm/scheduler: rework job destruction
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>> Hi Christian,
>>>>        Please refer to follow log, when it enter to
>>>> amdgpu_device_gpu_recover
>>> function, the bad job 000000005086879e is freeing in function
>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>signal.
>>> But amdgpu_device_gpu_recover goes faster, at this case, the s_fence
>>> is already freed, but job is not freed in time. Then this issue occurs.
>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>sdma0
>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>> amdgpu
>>> 0000:00:08.0: GPU reset begin!
>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>>>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>> pid 0, s_job:0000000066eb74ab [  449.794222]
>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>> pid 0,
>>> s_job:00000000ea85e922 [  449.794287]
>>> Emily:amdgpu_job_free_cb,Process
>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference at
>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>>> [#1] SMP PTI
>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [  449.803488]
>RIP:
>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff
>>>> 45 85 e4 0f
>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b
>98
>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>> 0000000000000400 [  449.811937] Call Trace:
>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>> Sorry, please take your time.
>>>>> Have you seen my other response a bit below?
>>>>>
>>>>> I can't follow how it would be possible for job->s_fence to be NULL
>>>>> without the job also being freed.
>>>>>
>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>> Ping.....
>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>Behalf
>>>>>>>>> Of Deng, Emily
>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>
>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>> When the job is already signaled, the s_fence is freed. Then
>>>>>>>>>>> it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go
>>>>>>>>> to free
>>> job.
>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>>>> No, that case can't happen. See here:
>>>>>>>
>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>>>>>
>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>            dma_fence_put(job->fence);
>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>>>>>            kfree(job);
>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>> to the
>>> s_fence.
>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>> patch is a clear NAK.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>> ---
>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>amdgpu_device_gpu_recover(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>       	 *
>>>>>>>>>>>       	 * job->base holds a reference to parent fence
>>>>>>>>>>>       	 */
>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>>>>> &&
>>>>>>>>>>>       	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>       		job_signaled = true;
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>>>> drm_sched_job
>>>>>>>>>>> *bad)
>>>>>>>>>>>
>>>>>>>>>>>       			spin_lock(&rq->lock);
>>>>>>>>>>>       			list_for_each_entry_safe(entity, tmp,
>&rq-
>>>>>> entities,
>>>>>>>>>> list) {
>>>>>>>>>>> -				if (bad->s_fence->scheduled.context
>>>>> ==
>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>       					if (atomic_read(&bad-
>>>>>> karma) >
>>>>>>>>>>>       					    bad->sched-
>>hang_limit)
>>>>>>>>>>>       						if (entity-
>>guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>> drm_sched_stop(struct
>>>>> drm_gpu_scheduler
>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>       	 * This iteration is thread safe as sched thread is
>stopped.
>>>>>>>>>>>       	 */
>>>>>>>>>>>       	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>       		    dma_fence_remove_callback(s_job-
>>s_fence-
>>>>>> parent,
>>>>>>>>>>>       					      &s_job->cb)) {
>>>>>>>>>>>       			atomic_dec(&sched->hw_rq_count);
>@@ -
>>>>> 395,7
>>>>>>>>> +395,8 @@ void
>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>       			 *
>>>>>>>>>>>       			 * Job is still alive so fence refcount at
>least 1
>>>>>>>>>>>       			 */
>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>>>>> false);
>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>>>>> finished,
>>>>>>>>>> false);
>>>>>>>>>>>       			/*
>>>>>>>>>>>       			 * We must keep bad job alive for later
>use
>>>>> during @@
>>>>>>>>>> -438,7
>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>> *sched,
>>>>>>>>>>> +bool
>>>>>>>>>> full_recovery)
>>>>>>>>>>>       	 * GPU recovers can't run in parallel.
>>>>>>>>>>>       	 */
>>>>>>>>>>>       	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>> node)
>>>>>>>>>> {
>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>>>>> s_fence-
>>>>>>>>>>> parent :
>>>>>>>>>>> +NULL;
>>>>>>>>>>>
>>>>>>>>>>>       		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>
>>>>>>>>> _______________________________________________
>>>>>>>>> amd-gfx mailing list
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:35                                                 ` Koenig, Christian
  0 siblings, 0 replies; 80+ messages in thread
From: Koenig, Christian @ 2019-11-08 10:35 UTC (permalink / raw)
  To: Deng, Emily, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Emily,

exactly that can't happen. See here:

>         /* Don't destroy jobs while the timeout worker is running */
>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>             !cancel_delayed_work(&sched->work_tdr))
>                 return NULL;

We never free jobs while the timeout worker is running, to prevent
exactly that issue.
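
For context, that check sits at the top of drm_sched_get_cleanup_job(); roughly like this (a reconstruction from memory of the scheduler after the rework, so only the quoted lines above are verbatim and the details may differ):

	static struct drm_sched_job *
	drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
	{
		struct drm_sched_job *job;
		unsigned long flags;

		/* Don't destroy jobs while the timeout worker is running */
		if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
		    !cancel_delayed_work(&sched->work_tdr))
			return NULL;

		spin_lock_irqsave(&sched->job_list_lock, flags);

		job = list_first_entry_or_null(&sched->ring_mirror_list,
					       struct drm_sched_job, node);

		if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
			/* remove the job from the mirror list before freeing it */
			list_del_init(&job->node);
		} else {
			job = NULL;
			/* re-arm the timeout for the next job in the list */
			drm_sched_start_timeout(sched);
		}

		spin_unlock_irqrestore(&sched->job_list_lock, flags);

		return job;
	}

Once the timeout work has started running it can no longer be cancelled, cancel_delayed_work() returns false, and the main thread backs off without freeing anything.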

Regards,
Christian.

Am 08.11.19 um 11:32 schrieb Deng, Emily:
> Hi Christian,
>       The drm_sched_job_timedout-> amdgpu_job_timedout call amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, November 8, 2019 6:26 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Hi Emily,
>>
>> well who is calling amdgpu_device_gpu_recover() in this case?
>>
>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>
>> Regards,
>> Christian.
>>
>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>> Hi Chrisitan,
>>>        No, I am with the new branch and also has the patch. Even it are freed by
>> main scheduler, how we could avoid main scheduler to free jobs while enter
>> to function amdgpu_device_gpu_recover?
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Hi Emily,
>>>>
>>>> in this case you are on an old code branch.
>>>>
>>>> Jobs are freed now by the main scheduler thread and only if no
>>>> timeout handler is running.
>>>>
>>>> See this patch here:
>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>
>>>>>       drm/scheduler: rework job destruction
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>> Hi Christian,
>>>>>         Please refer to follow log, when it enter to
>>>>> amdgpu_device_gpu_recover
>>>> function, the bad job 000000005086879e is freeing in function
>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>> signal.
>>>> But amdgpu_device_gpu_recover goes faster, at this case, the s_fence
>>>> is already freed, but job is not freed in time. Then this issue occurs.
>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>> sdma0
>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>> amdgpu
>>>> 0000:00:08.0: GPU reset begin!
>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>>>>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0,
>>>> s_job:00000000ea85e922 [  449.794287]
>>>> Emily:amdgpu_job_free_cb,Process
>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference at
>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>>>> [#1] SMP PTI
>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [  449.803488]
>> RIP:
>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff
>>>>> 45 85 e4 0f
>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b
>> 98
>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>> Sorry, please take your time.
>>>>>> Have you seen my other response a bit below?
>>>>>>
>>>>>> I can't follow how it would be possible for job->s_fence to be NULL
>>>>>> without the job also being freed.
>>>>>>
>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>> Ping.....
>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>> Behalf
>>>>>>>>>> Of Deng, Emily
>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>>> tdr
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>> When the job is already signaled, the s_fence is freed. Then
>>>>>>>>>>>> it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go
>>>>>>>>>> to free
>>>> job.
>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>>>>> No, that case can't happen. See here:
>>>>>>>>
>>>>>>>>>             drm_sched_job_cleanup(s_job);
>>>>>>>>>
>>>>>>>>>             amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>             dma_fence_put(job->fence);
>>>>>>>>>             amdgpu_sync_free(&job->sync);
>>>>>>>>>             amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>             kfree(job);
>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>> to the
>>>> s_fence.
>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>> patch is a clear NAK.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>        drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>>        2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>        	 *
>>>>>>>>>>>>        	 * job->base holds a reference to parent fence
>>>>>>>>>>>>        	 */
>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>>>>>> &&
>>>>>>>>>>>>        	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>        		job_signaled = true;
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>> *bad)
>>>>>>>>>>>>
>>>>>>>>>>>>        			spin_lock(&rq->lock);
>>>>>>>>>>>>        			list_for_each_entry_safe(entity, tmp,
>> &rq-
>>>>>>> entities,
>>>>>>>>>>> list) {
>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context
>>>>>> ==
>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>        					if (atomic_read(&bad-
>>>>>>> karma) >
>>>>>>>>>>>>        					    bad->sched-
>>> hang_limit)
>>>>>>>>>>>>        						if (entity-
>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>> drm_sched_stop(struct
>>>>>> drm_gpu_scheduler
>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>        	 * This iteration is thread safe as sched thread is
>> stopped.
>>>>>>>>>>>>        	 */
>>>>>>>>>>>>        	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>        		    dma_fence_remove_callback(s_job-
>>> s_fence-
>>>>>>> parent,
>>>>>>>>>>>>        					      &s_job->cb)) {
>>>>>>>>>>>>        			atomic_dec(&sched->hw_rq_count);
>> @@ -
>>>>>> 395,7
>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>        			 *
>>>>>>>>>>>>        			 * Job is still alive so fence refcount at
>> least 1
>>>>>>>>>>>>        			 */
>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>>>>>> false);
>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>>>>>> finished,
>>>>>>>>>>> false);
>>>>>>>>>>>>        			/*
>>>>>>>>>>>>        			 * We must keep bad job alive for later
>> use
>>>>>> during @@
>>>>>>>>>>> -438,7
>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>> *sched,
>>>>>>>>>>>> +bool
>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>        	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>        	 */
>>>>>>>>>>>>        	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>> node)
>>>>>>>>>>> {
>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>>>>>> s_fence-
>>>>>>>>>>>> parent :
>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>
>>>>>>>>>>>>        		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>
>>>>>>>>>> _______________________________________________
>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:54                                                     ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08 10:54 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Christian,
     Sorry, it seems I understood this wrong. And from the prints, the thread that frees the job is the same as the job timeout thread. So there seems to be some issue in the amdgpu_device_gpu_recover function.
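
For reference, that comparison comes from prints like the ones in the log above; a minimal sketch of such instrumentation could look like this (the DRM_INFO lines are illustrative only, not the exact statements used):

	/* Log which task runs the free-job path vs. the timeout path. */
	static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
	{
		DRM_INFO("amdgpu_job_free_cb on %s (pid %d), s_job:%p\n",
			 current->comm, current->pid, s_job);
		/* ... existing cleanup: drm_sched_job_cleanup(), kfree(job) ... */
	}

	static void amdgpu_job_timedout(struct drm_sched_job *s_job)
	{
		DRM_INFO("amdgpu_job_timedout on %s (pid %d), s_job:%p\n",
			 current->comm, current->pid, s_job);
		/* ... existing handling: amdgpu_device_gpu_recover() ... */
	}

If comm/pid match in the two prints, both paths are indeed running on the same kworker.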


Best wishes
Emily Deng



>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Friday, November 8, 2019 6:35 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Hi Emily,
>
>exactly that can't happen. See here:
>
>>         /* Don't destroy jobs while the timeout worker is running */
>>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>             !cancel_delayed_work(&sched->work_tdr))
>>                 return NULL;
>
>We never free jobs while the timeout working is running to prevent exactly
>that issue.
>
>Regards,
>Christian.
>
>Am 08.11.19 um 11:32 schrieb Deng, Emily:
>> Hi Christian,
>>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 6:26 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Hi Emily,
>>>
>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>
>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>> Hi Chrisitan,
>>>>        No, I am with the new branch and also has the patch. Even it
>>>> are freed by
>>> main scheduler, how we could avoid main scheduler to free jobs while
>>> enter to function amdgpu_device_gpu_recover?
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Hi Emily,
>>>>>
>>>>> in this case you are on an old code branch.
>>>>>
>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>> timeout handler is running.
>>>>>
>>>>> See this patch here:
>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>
>>>>>>       drm/scheduler: rework job destruction
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>         Please refer to follow log, when it enter to
>>>>>> amdgpu_device_gpu_recover
>>>>> function, the bad job 000000005086879e is freeing in function
>>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>> signal.
>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>> s_fence is already freed, but job is not freed in time. Then this issue
>occurs.
>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>> sdma0
>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>> amdgpu
>>>>> 0000:00:08.0: GPU reset begin!
>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0,
>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>> Emily:amdgpu_job_free_cb,Process
>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>> at
>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>> 0000 [#1] SMP PTI
>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>> 449.803488]
>>> RIP:
>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>> ff
>>>>>> 45 85 e4 0f
>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>> <48> 8b
>>> 98
>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>CR0:
>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>> Sorry, please take your time.
>>>>>>> Have you seen my other response a bit below?
>>>>>>>
>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>> NULL without the job also being freed.
>>>>>>>
>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>> Ping.....
>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>> Behalf
>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>>>>>>> go to free
>>>>> job.
>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>
>>>>>>>>>>             drm_sched_job_cleanup(s_job);
>>>>>>>>>>
>>>>>>>>>>             amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>             dma_fence_put(job->fence);
>>>>>>>>>>             amdgpu_sync_free(&job->sync);
>>>>>>>>>>             amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>             kfree(job);
>>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>>> to the
>>>>> s_fence.
>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>> patch is a clear NAK.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>        drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>--
>>>>>>>>>>>>>        2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>        	 *
>>>>>>>>>>>>>        	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>        	 */
>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence-
>>parent
>>>>>>> &&
>>>>>>>>>>>>>        	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>        		job_signaled = true;
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>drm_sched_increase_karma(struct
>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>
>>>>>>>>>>>>>        			spin_lock(&rq->lock);
>>>>>>>>>>>>>        			list_for_each_entry_safe(entity, tmp,
>>> &rq-
>>>>>>>> entities,
>>>>>>>>>>>> list) {
>>>>>>>>>>>>> -				if (bad->s_fence-
>>scheduled.context
>>>>>>> ==
>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>s_fence-
>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>        					if (atomic_read(&bad-
>>>>>>>> karma) >
>>>>>>>>>>>>>        					    bad->sched-
>>>> hang_limit)
>>>>>>>>>>>>>        						if (entity-
>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>        	 * This iteration is thread safe as sched thread is
>>> stopped.
>>>>>>>>>>>>>        	 */
>>>>>>>>>>>>>        	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>&&
>>>>>>>>>>>>>        		    dma_fence_remove_callback(s_job-
>>>> s_fence-
>>>>>>>> parent,
>>>>>>>>>>>>>        					      &s_job->cb)) {
>>>>>>>>>>>>>        			atomic_dec(&sched->hw_rq_count);
>>> @@ -
>>>>>>> 395,7
>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>        			 *
>>>>>>>>>>>>>        			 * Job is still alive so fence refcount at
>>> least 1
>>>>>>>>>>>>>        			 */
>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>finished,
>>>>>>> false);
>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>s_fence-
>>>>>>>> finished,
>>>>>>>>>>>> false);
>>>>>>>>>>>>>        			/*
>>>>>>>>>>>>>        			 * We must keep bad job alive for later
>>> use
>>>>>>> during @@
>>>>>>>>>>>> -438,7
>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>> *sched,
>>>>>>>>>>>>> +bool
>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>        	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>        	 */
>>>>>>>>>>>>>        	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>> node)
>>>>>>>>>>>> {
>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>parent;
>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>s_job-
>>>>>>>> s_fence-
>>>>>>>>>>>>> parent :
>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>
>>>>>>>>>>>>>        		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>
>>>>>>>>>>> _______________________________________________
>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 10:54                                                     ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-08 10:54 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx

Hi Christian,
     Sorry, it seems I misunderstood. From the prints, the thread that frees the job is the same as the job timeout thread, so there seems to be some issue in the amdgpu_device_gpu_recover function itself.
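
As a rough illustration of the two contexts involved (a simplified sketch
using the drm scheduler fields of that era; not exact driver code and not
a claim about which thread this particular log shows): timeout handling
runs from sched->work_tdr in a workqueue worker, while with the reworked
job destruction the free_job callbacks run from sched->thread, the
per-ring scheduler kthread. A print like the one below could be called
from both paths to compare the identities directly.

#include <drm/gpu_scheduler.h>
#include <linux/kthread.h>
#include <linux/printk.h>
#include <linux/sched.h>

/*
 * Dump the scheduler kthread identity and the identity of whoever is
 * calling this, so the "freeing thread == timeout thread" question can
 * be answered from the log.  Illustrative helper only.
 */
static void sched_dump_contexts(struct drm_gpu_scheduler *sched)
{
	pr_info("sched %s: kthread %s (pid %d), caller %s (pid %d)\n",
		sched->name, sched->thread->comm,
		task_pid_nr(sched->thread),
		current->comm, task_pid_nr(current));
}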


Best wishes
Emily Deng



>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Friday, November 8, 2019 6:35 PM
>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Hi Emily,
>
>exactly that can't happen. See here:
>
>>         /* Don't destroy jobs while the timeout worker is running */
>>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>             !cancel_delayed_work(&sched->work_tdr))
>>                 return NULL;
>
>We never free jobs while the timeout working is running to prevent exactly
>that issue.
>
>Regards,
>Christian.
>
>Am 08.11.19 um 11:32 schrieb Deng, Emily:
>> Hi Christian,
>>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 6:26 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Hi Emily,
>>>
>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>
>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>> Hi Chrisitan,
>>>>        No, I am with the new branch and also has the patch. Even it
>>>> are freed by
>>> main scheduler, how we could avoid main scheduler to free jobs while
>>> enter to function amdgpu_device_gpu_recover?
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Hi Emily,
>>>>>
>>>>> in this case you are on an old code branch.
>>>>>
>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>> timeout handler is running.
>>>>>
>>>>> See this patch here:
>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>
>>>>>>       drm/scheduler: rework job destruction
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>         Please refer to follow log, when it enter to
>>>>>> amdgpu_device_gpu_recover
>>>>> function, the bad job 000000005086879e is freeing in function
>>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>> signal.
>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>> s_fence is already freed, but job is not freed in time. Then this issue
>occurs.
>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>> sdma0
>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>> amdgpu
>>>>> 0000:00:08.0: GPU reset begin!
>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0,
>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>> Emily:amdgpu_job_free_cb,Process
>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>> at
>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>> 0000 [#1] SMP PTI
>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>> 449.803488]
>>> RIP:
>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>> ff
>>>>>> 45 85 e4 0f
>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>> <48> 8b
>>> 98
>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>CR0:
>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>> Sorry, please take your time.
>>>>>>> Have you seen my other response a bit below?
>>>>>>>
>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>> NULL without the job also being freed.
>>>>>>>
>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>> Ping.....
>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>> Behalf
>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>>>>>>> go to free
>>>>> job.
>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>
>>>>>>>>>>             drm_sched_job_cleanup(s_job);
>>>>>>>>>>
>>>>>>>>>>             amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>             dma_fence_put(job->fence);
>>>>>>>>>>             amdgpu_sync_free(&job->sync);
>>>>>>>>>>             amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>             kfree(job);
>>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>>> to the
>>>>> s_fence.
>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>> patch is a clear NAK.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>        drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>--
>>>>>>>>>>>>>        2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>        	 *
>>>>>>>>>>>>>        	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>        	 */
>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence-
>>parent
>>>>>>> &&
>>>>>>>>>>>>>        	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>        		job_signaled = true;
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>drm_sched_increase_karma(struct
>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>
>>>>>>>>>>>>>        			spin_lock(&rq->lock);
>>>>>>>>>>>>>        			list_for_each_entry_safe(entity, tmp,
>>> &rq-
>>>>>>>> entities,
>>>>>>>>>>>> list) {
>>>>>>>>>>>>> -				if (bad->s_fence-
>>scheduled.context
>>>>>>> ==
>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>s_fence-
>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>        					if (atomic_read(&bad-
>>>>>>>> karma) >
>>>>>>>>>>>>>        					    bad->sched-
>>>> hang_limit)
>>>>>>>>>>>>>        						if (entity-
>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>        	 * This iteration is thread safe as sched thread is
>>> stopped.
>>>>>>>>>>>>>        	 */
>>>>>>>>>>>>>        	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>&&
>>>>>>>>>>>>>        		    dma_fence_remove_callback(s_job-
>>>> s_fence-
>>>>>>>> parent,
>>>>>>>>>>>>>        					      &s_job->cb)) {
>>>>>>>>>>>>>        			atomic_dec(&sched->hw_rq_count);
>>> @@ -
>>>>>>> 395,7
>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>        			 *
>>>>>>>>>>>>>        			 * Job is still alive so fence refcount at
>>> least 1
>>>>>>>>>>>>>        			 */
>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>finished,
>>>>>>> false);
>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>s_fence-
>>>>>>>> finished,
>>>>>>>>>>>> false);
>>>>>>>>>>>>>        			/*
>>>>>>>>>>>>>        			 * We must keep bad job alive for later
>>> use
>>>>>>> during @@
>>>>>>>>>>>> -438,7
>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>> *sched,
>>>>>>>>>>>>> +bool
>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>        	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>        	 */
>>>>>>>>>>>>>        	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>> node)
>>>>>>>>>>>> {
>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>parent;
>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>s_job-
>>>>>>>> s_fence-
>>>>>>>>>>>>> parent :
>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>
>>>>>>>>>>>>>        		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>
>>>>>>>>>>> _______________________________________________
>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 19:01                                                     ` Grodzovsky, Andrey
  0 siblings, 0 replies; 80+ messages in thread
From: Grodzovsky, Andrey @ 2019-11-08 19:01 UTC (permalink / raw)
  To: Koenig, Christian, Deng, Emily, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


On 11/8/19 5:35 AM, Koenig, Christian wrote:
> Hi Emily,
>
> exactly that can't happen. See here:
>
>>          /* Don't destroy jobs while the timeout worker is running */
>>          if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>              !cancel_delayed_work(&sched->work_tdr))
>>                  return NULL;
> We never free jobs while the timeout working is running to prevent
> exactly that issue.


I don't think this protects us if drm_sched_cleanup_jobs is called for a
scheduler which didn't experience a timeout: in
amdgpu_device_gpu_recover we access sched->ring_mirror_list for all the
schedulers on a device, so the condition above won't protect us. What
could in fact help is my recent patch 541c521 ("drm/sched: Avoid job
cleanup if sched thread is parked"), because we park each of the
scheduler threads during TDR before trying to access
sched->ring_mirror_list.
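
A minimal sketch of that idea (illustrative only, the actual patch may
differ in details and placement): skip the job cleanup when the
scheduler thread has been asked to park, in addition to the existing
timeout-worker check quoted above.

#include <drm/gpu_scheduler.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/workqueue.h>

static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
{
	/*
	 * Don't destroy jobs while the timeout worker is running, or
	 * while the thread is being parked because a recovery path is
	 * about to walk ring_mirror_list.
	 */
	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	     !cancel_delayed_work(&sched->work_tdr)) ||
	    __kthread_should_park(sched->thread))
		return;

	/* ... retire signaled jobs from sched->ring_mirror_list ... */
}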

Emily - did you see this problem with that patch in place? I only
pushed it yesterday.

Andrey


>
> Regards,
> Christian.
>
> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>> Hi Christian,
>>        The drm_sched_job_timedout-> amdgpu_job_timedout call amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 6:26 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Hi Emily,
>>>
>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>
>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>> Hi Chrisitan,
>>>>         No, I am with the new branch and also has the patch. Even it are freed by
>>> main scheduler, how we could avoid main scheduler to free jobs while enter
>>> to function amdgpu_device_gpu_recover?
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Hi Emily,
>>>>>
>>>>> in this case you are on an old code branch.
>>>>>
>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>> timeout handler is running.
>>>>>
>>>>> See this patch here:
>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>
>>>>>>        drm/scheduler: rework job destruction
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>          Please refer to follow log, when it enter to
>>>>>> amdgpu_device_gpu_recover
>>>>> function, the bad job 000000005086879e is freeing in function
>>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>> signal.
>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the s_fence
>>>>> is already freed, but job is not freed in time. Then this issue occurs.
>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>> sdma0
>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>> amdgpu
>>>>> 0000:00:08.0: GPU reset begin!
>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>>>>>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>>> pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>>> pid 0,
>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>> Emily:amdgpu_job_free_cb,Process
>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference at
>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>>>>> [#1] SMP PTI
>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [  449.803488]
>>> RIP:
>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff
>>>>>> 45 85 e4 0f
>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b
>>> 98
>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>>
>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>> Sorry, please take your time.
>>>>>>> Have you seen my other response a bit below?
>>>>>>>
>>>>>>> I can't follow how it would be possible for job->s_fence to be NULL
>>>>>>> without the job also being freed.
>>>>>>>
>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>> Ping.....
>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>> Behalf
>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>>>> tdr
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed. Then
>>>>>>>>>>>>> it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go
>>>>>>>>>>> to free
>>>>> job.
>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>
>>>>>>>>>>              drm_sched_job_cleanup(s_job);
>>>>>>>>>>
>>>>>>>>>>              amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>              dma_fence_put(job->fence);
>>>>>>>>>>              amdgpu_sync_free(&job->sync);
>>>>>>>>>>              amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>              kfree(job);
>>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>>> to the
>>>>> s_fence.
>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>> patch is a clear NAK.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>         drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>>>         2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>         	 *
>>>>>>>>>>>>>         	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent
>>>>>>> &&
>>>>>>>>>>>>>         	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>         		job_signaled = true;
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>
>>>>>>>>>>>>>         			spin_lock(&rq->lock);
>>>>>>>>>>>>>         			list_for_each_entry_safe(entity, tmp,
>>> &rq-
>>>>>>>> entities,
>>>>>>>>>>>> list) {
>>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context
>>>>>>> ==
>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence-
>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>         					if (atomic_read(&bad-
>>>>>>>> karma) >
>>>>>>>>>>>>>         					    bad->sched-
>>>> hang_limit)
>>>>>>>>>>>>>         						if (entity-
>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>         	 * This iteration is thread safe as sched thread is
>>> stopped.
>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>         	list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>>         		    dma_fence_remove_callback(s_job-
>>>> s_fence-
>>>>>>>> parent,
>>>>>>>>>>>>>         					      &s_job->cb)) {
>>>>>>>>>>>>>         			atomic_dec(&sched->hw_rq_count);
>>> @@ -
>>>>>>> 395,7
>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>         			 *
>>>>>>>>>>>>>         			 * Job is still alive so fence refcount at
>>> least 1
>>>>>>>>>>>>>         			 */
>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished,
>>>>>>> false);
>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence-
>>>>>>>> finished,
>>>>>>>>>>>> false);
>>>>>>>>>>>>>         			/*
>>>>>>>>>>>>>         			 * We must keep bad job alive for later
>>> use
>>>>>>> during @@
>>>>>>>>>>>> -438,7
>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>> *sched,
>>>>>>>>>>>>> +bool
>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>         	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>         	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>> node)
>>>>>>>>>>>> {
>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job-
>>>>>>>> s_fence-
>>>>>>>>>>>>> parent :
>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>
>>>>>>>>>>>>>         		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>
>>>>>>>>>>> _______________________________________________
>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 19:04                                                         ` Grodzovsky, Andrey
  0 siblings, 0 replies; 80+ messages in thread
From: Grodzovsky, Andrey @ 2019-11-08 19:04 UTC (permalink / raw)
  To: Deng, Emily, Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


On 11/8/19 5:54 AM, Deng, Emily wrote:
> Hi Christian,
>       Sorry, seems I understand wrong. And from the print, the free job's thread is the same as job timeout thread. So seems have some issue in function amdgpu_device_gpu_recover.

I don't think that's correct; it seems your prints just don't show the
pids, because they are all zeros, which cannot be true for any kernel or
user mode thread. In fact, looking at the oops I see the pid of the
actual TDR worker, which is 55:

CPU: 3 PID: 55 Comm: kworker/3:1 Tainted
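
A hypothetical way to make such prints unambiguous (this is not the
instrumentation that produced the log above) is to log the thread that
is actually executing the callback via current, rather than any pid
stored alongside the job:

#include <linux/printk.h>
#include <linux/sched.h>

/* Hypothetical helper, for illustration only. */
static void report_current_thread(const char *where, void *s_job)
{
	pr_info("%s: s_job:%p handled by pid %d (%s)\n",
		where, s_job, task_pid_nr(current), current->comm);
}

/* e.g. report_current_thread("amdgpu_job_free_cb", s_job); */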

Andrey


>
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, November 8, 2019 6:35 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Hi Emily,
>>
>> exactly that can't happen. See here:
>>
>>>          /* Don't destroy jobs while the timeout worker is running */
>>>          if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>              !cancel_delayed_work(&sched->work_tdr))
>>>                  return NULL;
>> We never free jobs while the timeout working is running to prevent exactly
>> that issue.
>>
>> Regards,
>> Christian.
>>
>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>> Hi Christian,
>>>        The drm_sched_job_timedout-> amdgpu_job_timedout call
>> amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>> in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Hi Emily,
>>>>
>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>
>>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>> Hi Chrisitan,
>>>>>         No, I am with the new branch and also has the patch. Even it
>>>>> are freed by
>>>> main scheduler, how we could avoid main scheduler to free jobs while
>>>> enter to function amdgpu_device_gpu_recover?
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Hi Emily,
>>>>>>
>>>>>> in this case you are on an old code branch.
>>>>>>
>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>> timeout handler is running.
>>>>>>
>>>>>> See this patch here:
>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>
>>>>>>>        drm/scheduler: rework job destruction
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>> Hi Christian,
>>>>>>>          Please refer to follow log, when it enter to
>>>>>>> amdgpu_device_gpu_recover
>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>> signal.
>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>> s_fence is already freed, but job is not freed in time. Then this issue
>> occurs.
>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>> sdma0
>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>>> amdgpu
>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0,
>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>>> at
>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>> 0000 [#1] SMP PTI
>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>> 449.803488]
>>>> RIP:
>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>>> ff
>>>>>>> 45 85 e4 0f
>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>> <48> 8b
>>>> 98
>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>> CR0:
>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>> Sorry, please take your time.
>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>
>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>> NULL without the job also being freed.
>>>>>>>>
>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>>> tdr
>>>>>>>>>>
>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>> Ping.....
>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>
>>>>>>>>>>> Best wishes
>>>>>>>>>>> Emily Deng
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>> Behalf
>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>>>>>>>> go to free
>>>>>> job.
>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>
>>>>>>>>>>>              drm_sched_job_cleanup(s_job);
>>>>>>>>>>>
>>>>>>>>>>>              amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>              dma_fence_put(job->fence);
>>>>>>>>>>>              amdgpu_sync_free(&job->sync);
>>>>>>>>>>>              amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>              kfree(job);
>>>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>>>> to the
>>>>>> s_fence.
>>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>         drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>> --
>>>>>>>>>>>>>>         2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>         	 *
>>>>>>>>>>>>>>         	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence-
>>> parent
>>>>>>>> &&
>>>>>>>>>>>>>>         	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>         		job_signaled = true;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>> drm_sched_increase_karma(struct
>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>         			spin_lock(&rq->lock);
>>>>>>>>>>>>>>         			list_for_each_entry_safe(entity, tmp,
>>>> &rq-
>>>>>>>>> entities,
>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>> -				if (bad->s_fence-
>>> scheduled.context
>>>>>>>> ==
>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>> s_fence-
>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>         					if (atomic_read(&bad-
>>>>>>>>> karma) >
>>>>>>>>>>>>>>         					    bad->sched-
>>>>> hang_limit)
>>>>>>>>>>>>>>         						if (entity-
>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>         	 * This iteration is thread safe as sched thread is
>>>> stopped.
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>         	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>> &&
>>>>>>>>>>>>>>         		    dma_fence_remove_callback(s_job-
>>>>> s_fence-
>>>>>>>>> parent,
>>>>>>>>>>>>>>         					      &s_job->cb)) {
>>>>>>>>>>>>>>         			atomic_dec(&sched->hw_rq_count);
>>>> @@ -
>>>>>>>> 395,7
>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>         			 *
>>>>>>>>>>>>>>         			 * Job is still alive so fence refcount at
>>>> least 1
>>>>>>>>>>>>>>         			 */
>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>> finished,
>>>>>>>> false);
>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>> s_fence-
>>>>>>>>> finished,
>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>         			/*
>>>>>>>>>>>>>>         			 * We must keep bad job alive for later
>>>> use
>>>>>>>> during @@
>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>>> *sched,
>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>         	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>         	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>> node)
>>>>>>>>>>>>> {
>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>> parent;
>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>> s_job-
>>>>>>>>> s_fence-
>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>         		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>
>>>>>>>>>>>> _______________________________________________
>>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-08 19:04                                                         ` Grodzovsky, Andrey
  0 siblings, 0 replies; 80+ messages in thread
From: Grodzovsky, Andrey @ 2019-11-08 19:04 UTC (permalink / raw)
  To: Deng, Emily, Koenig, Christian, amd-gfx


On 11/8/19 5:54 AM, Deng, Emily wrote:
> Hi Christian,
>       Sorry, seems I understand wrong. And from the print, the free job's thread is the same as job timeout thread. So seems have some issue in function amdgpu_device_gpu_recover.

I don't think that's correct; it seems your prints just don't print the pids, 
because they are all zeros, which cannot be true for any kernel or user mode 
thread. In fact, looking at the Oops I see the actual TDR job's pid, which is 55:

CPU: 3 PID: 55 Comm: kworker/3:1 Tainted

Andrey
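
(For illustration only: a debug print that records the thread actually
executing the free callback would make the executing pid visible in the log.
The DRM_INFO wording below is an assumption for the sketch, not the existing
amdgpu print, and it presumes the amdgpu_job_free_cb() context where s_job is
in scope.)

	/* Hypothetical print inside amdgpu_job_free_cb(): log the comm/pid of
	 * the thread running the callback, in addition to the job pointer.
	 */
	DRM_INFO("amdgpu_job_free_cb: running in %s (pid %d), s_job:%p\n",
		 current->comm, task_pid_nr(current), s_job);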


>
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, November 8, 2019 6:35 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Hi Emily,
>>
>> exactly that can't happen. See here:
>>
>>>          /* Don't destroy jobs while the timeout worker is running */
>>>          if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>              !cancel_delayed_work(&sched->work_tdr))
>>>                  return NULL;
>> We never free jobs while the timeout working is running to prevent exactly
>> that issue.
>>
>> Regards,
>> Christian.
>>
>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>> Hi Christian,
>>>        The drm_sched_job_timedout-> amdgpu_job_timedout call
>> amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>> in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Hi Emily,
>>>>
>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>
>>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>> Hi Chrisitan,
>>>>>         No, I am with the new branch and also has the patch. Even it
>>>>> are freed by
>>>> main scheduler, how we could avoid main scheduler to free jobs while
>>>> enter to function amdgpu_device_gpu_recover?
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Hi Emily,
>>>>>>
>>>>>> in this case you are on an old code branch.
>>>>>>
>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>> timeout handler is running.
>>>>>>
>>>>>> See this patch here:
>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>
>>>>>>>        drm/scheduler: rework job destruction
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>> Hi Christian,
>>>>>>>          Please refer to follow log, when it enter to
>>>>>>> amdgpu_device_gpu_recover
>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>> signal.
>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>> s_fence is already freed, but job is not freed in time. Then this issue
>> occurs.
>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>> sdma0
>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>>> amdgpu
>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0,
>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>>> at
>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>> 0000 [#1] SMP PTI
>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>> 449.803488]
>>>> RIP:
>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>>> ff
>>>>>>> 45 85 e4 0f
>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>> <48> 8b
>>>> 98
>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>> CR0:
>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>> Sorry, please take your time.
>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>
>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>> NULL without the job also being freed.
>>>>>>>>
>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>>> tdr
>>>>>>>>>>
>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>> Ping.....
>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>
>>>>>>>>>>> Best wishes
>>>>>>>>>>> Emily Deng
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>> Behalf
>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>>>>>>>> go to free
>>>>>> job.
>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>
>>>>>>>>>>>              drm_sched_job_cleanup(s_job);
>>>>>>>>>>>
>>>>>>>>>>>              amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>              dma_fence_put(job->fence);
>>>>>>>>>>>              amdgpu_sync_free(&job->sync);
>>>>>>>>>>>              amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>              kfree(job);
>>>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>>>> to the
>>>>>> s_fence.
>>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>         drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>> --
>>>>>>>>>>>>>>         2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>         	 *
>>>>>>>>>>>>>>         	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence-
>>> parent
>>>>>>>> &&
>>>>>>>>>>>>>>         	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>         		job_signaled = true;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>> drm_sched_increase_karma(struct
>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>         			spin_lock(&rq->lock);
>>>>>>>>>>>>>>         			list_for_each_entry_safe(entity, tmp,
>>>> &rq-
>>>>>>>>> entities,
>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>> -				if (bad->s_fence-
>>> scheduled.context
>>>>>>>> ==
>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>> s_fence-
>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>         					if (atomic_read(&bad-
>>>>>>>>> karma) >
>>>>>>>>>>>>>>         					    bad->sched-
>>>>> hang_limit)
>>>>>>>>>>>>>>         						if (entity-
>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>         	 * This iteration is thread safe as sched thread is
>>>> stopped.
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>         	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>> &&
>>>>>>>>>>>>>>         		    dma_fence_remove_callback(s_job-
>>>>> s_fence-
>>>>>>>>> parent,
>>>>>>>>>>>>>>         					      &s_job->cb)) {
>>>>>>>>>>>>>>         			atomic_dec(&sched->hw_rq_count);
>>>> @@ -
>>>>>>>> 395,7
>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>         			 *
>>>>>>>>>>>>>>         			 * Job is still alive so fence refcount at
>>>> least 1
>>>>>>>>>>>>>>         			 */
>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>> finished,
>>>>>>>> false);
>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>> s_fence-
>>>>>>>>> finished,
>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>         			/*
>>>>>>>>>>>>>>         			 * We must keep bad job alive for later
>>>> use
>>>>>>>> during @@
>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>>> *sched,
>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>         	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>         	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>> node)
>>>>>>>>>>>>> {
>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>> parent;
>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>> s_job-
>>>>>>>>> s_fence-
>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>         		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>
>>>>>>>>>>>> _______________________________________________
>>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-11  7:19                                                         ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-11  7:19 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Andrey,
    I don't think your patch will help here. It may call kthread_should_park in drm_sched_cleanup_jobs first, and only then call kcl_kthread_park, so there is still a race between the two threads.
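
(For illustration, a simplified sketch of that ordering. The function bodies
below are assumptions used to show the race window, not the actual scheduler
code; free_finished_jobs() is a placeholder name.)

	/* Thread A: scheduler main thread, simplified drm_sched_cleanup_jobs() */
	static void cleanup_jobs_sketch(struct drm_gpu_scheduler *sched)
	{
		if (kthread_should_park())	/* (1) park not requested yet, so false */
			return;
		/* (3) continues and frees finished jobs; their s_fence goes away */
		free_finished_jobs(sched);	/* placeholder for the real cleanup */
	}

	/* Thread B: TDR / amdgpu_device_gpu_recover path, simplified */
	static void tdr_path_sketch(struct drm_gpu_scheduler *sched)
	{
		kthread_park(sched->thread);	/* (2) park requested after the check */
		/* walks sched->ring_mirror_list while thread A may still be freeing */
	}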

Best wishes
Emily Deng



>-----Original Message-----
>From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>Sent: Saturday, November 9, 2019 3:01 AM
>To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>
>On 11/8/19 5:35 AM, Koenig, Christian wrote:
>> Hi Emily,
>>
>> exactly that can't happen. See here:
>>
>>>          /* Don't destroy jobs while the timeout worker is running */
>>>          if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>              !cancel_delayed_work(&sched->work_tdr))
>>>                  return NULL;
>> We never free jobs while the timeout working is running to prevent
>> exactly that issue.
>
>
>I don't think this protects us if drm_sched_cleanup_jobs is called for scheduler
>which didn't experience a timeout, in amdgpu_device_gpu_recover we access
>sched->ring_mirror_list for all the schedulers on a device so this condition
>above won't protect us. What in fact could help maybe is my recent patch
>541c521 drm/sched: Avoid job cleanup if sched thread is parked. because we
>do park each of the scheduler threads during tdr job before trying to access
>sched->ring_mirror_list.
>
>Emily - did you see this problem with that patch in place ? I only pushed it
>yesterday.
>
>Andrey
>
>
>>
>> Regards,
>> Christian.
>>
>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>> Hi Christian,
>>>        The drm_sched_job_timedout-> amdgpu_job_timedout call
>amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Hi Emily,
>>>>
>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>
>>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>> Hi Chrisitan,
>>>>>         No, I am with the new branch and also has the patch. Even
>>>>> it are freed by
>>>> main scheduler, how we could avoid main scheduler to free jobs while
>>>> enter to function amdgpu_device_gpu_recover?
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>> tdr
>>>>>>
>>>>>> Hi Emily,
>>>>>>
>>>>>> in this case you are on an old code branch.
>>>>>>
>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>> timeout handler is running.
>>>>>>
>>>>>> See this patch here:
>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>
>>>>>>>        drm/scheduler: rework job destruction
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>> Hi Christian,
>>>>>>>          Please refer to follow log, when it enter to
>>>>>>> amdgpu_device_gpu_recover
>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware
>>>>>> fence
>>>> signal.
>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>> s_fence is already freed, but job is not freed in time. Then this issue
>occurs.
>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>> sdma0
>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>> 449.794163] amdgpu
>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>> 449.794221] Emily:amdgpu_job_free_cb,Process information: process
>>>>>>> pid 0 thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0,
>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>>>>>> [ 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>> dereference at
>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>> 0000 [#1] SMP PTI
>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>> 449.803488]
>>>> RIP:
>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>>> ff
>>>>>>> 45 85 e4 0f
>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>> <48> 8b
>>>> 98
>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>CR0:
>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>[
>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>> Sorry, please take your time.
>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>
>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>> NULL without the job also being freed.
>>>>>>>>
>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>
>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>> Ping.....
>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>
>>>>>>>>>>> Best wishes
>>>>>>>>>>> Emily Deng
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>> Behalf
>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>>>>>> one case, when it enter into the amdgpu_device_gpu_recover,
>>>>>>>>>>>> it already in drm_sched_job_cleanup, and at this time, it
>>>>>>>>>>>> will go to free
>>>>>> job.
>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>
>>>>>>>>>>>              drm_sched_job_cleanup(s_job);
>>>>>>>>>>>
>>>>>>>>>>>              amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>              dma_fence_put(job->fence);
>>>>>>>>>>>              amdgpu_sync_free(&job->sync);
>>>>>>>>>>>              amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>              kfree(job);
>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>> reference to the
>>>>>> s_fence.
>>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>         drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-
>----
>>>>>>>>>>>>>>         2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>         	 *
>>>>>>>>>>>>>>         	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>> +	if (job && job->base.s_fence &&
>>>>>>>>>>>>>> +job->base.s_fence->parent
>>>>>>>> &&
>>>>>>>>>>>>>>         	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>         		job_signaled = true;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>drm_sched_increase_karma(struct
>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>         			spin_lock(&rq->lock);
>>>>>>>>>>>>>>         			list_for_each_entry_safe(entity, tmp,
>>>> &rq-
>>>>>>>>> entities,
>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>> -				if (bad->s_fence-
>>scheduled.context
>>>>>>>> ==
>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>s_fence-
>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>         					if (atomic_read(&bad-
>>>>>>>>> karma) >
>>>>>>>>>>>>>>         					    bad->sched-
>>>>> hang_limit)
>>>>>>>>>>>>>>         						if (entity-
>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>         	 * This iteration is thread safe as sched thread
>>>>>>>>>>>>>> is
>>>> stopped.
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>         	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>&&
>>>>>>>>>>>>>>         		    dma_fence_remove_callback(s_job-
>>>>> s_fence-
>>>>>>>>> parent,
>>>>>>>>>>>>>>         					      &s_job->cb)) {
>>>>>>>>>>>>>>         			atomic_dec(&sched->hw_rq_count);
>>>> @@ -
>>>>>>>> 395,7
>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>         			 *
>>>>>>>>>>>>>>         			 * Job is still alive so fence refcount at
>>>> least 1
>>>>>>>>>>>>>>         			 */
>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>finished,
>>>>>>>> false);
>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>s_fence-
>>>>>>>>> finished,
>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>         			/*
>>>>>>>>>>>>>>         			 * We must keep bad job alive for later
>>>> use
>>>>>>>> during @@
>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>>> *sched,
>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>         	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>         	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>> node)
>>>>>>>>>>>>> {
>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>parent;
>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>s_job-
>>>>>>>>> s_fence-
>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>         		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>
>>>>>>>>>>>> _______________________________________________
>>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-11  7:19                                                         ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-11  7:19 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian, amd-gfx

Hi Andrey,
    I don't think your patch will help here. It may call kthread_should_park in drm_sched_cleanup_jobs first, and only then call kcl_kthread_park, so there is still a race between the two threads.

Best wishes
Emily Deng



>-----Original Message-----
>From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>Sent: Saturday, November 9, 2019 3:01 AM
>To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>
>On 11/8/19 5:35 AM, Koenig, Christian wrote:
>> Hi Emily,
>>
>> exactly that can't happen. See here:
>>
>>>          /* Don't destroy jobs while the timeout worker is running */
>>>          if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>              !cancel_delayed_work(&sched->work_tdr))
>>>                  return NULL;
>> We never free jobs while the timeout working is running to prevent
>> exactly that issue.
>
>
>I don't think this protects us if drm_sched_cleanup_jobs is called for scheduler
>which didn't experience a timeout, in amdgpu_device_gpu_recover we access
>sched->ring_mirror_list for all the schedulers on a device so this condition
>above won't protect us. What in fact could help maybe is my recent patch
>541c521 drm/sched: Avoid job cleanup if sched thread is parked. because we
>do park each of the scheduler threads during tdr job before trying to access
>sched->ring_mirror_list.
>
>Emily - did you see this problem with that patch in place ? I only pushed it
>yesterday.
>
>Andrey
>
>
>>
>> Regards,
>> Christian.
>>
>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>> Hi Christian,
>>>        The drm_sched_job_timedout-> amdgpu_job_timedout call
>amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Hi Emily,
>>>>
>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>
>>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>> Hi Chrisitan,
>>>>>         No, I am with the new branch and also has the patch. Even
>>>>> it are freed by
>>>> main scheduler, how we could avoid main scheduler to free jobs while
>>>> enter to function amdgpu_device_gpu_recover?
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>> tdr
>>>>>>
>>>>>> Hi Emily,
>>>>>>
>>>>>> in this case you are on an old code branch.
>>>>>>
>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>> timeout handler is running.
>>>>>>
>>>>>> See this patch here:
>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>
>>>>>>>        drm/scheduler: rework job destruction
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>> Hi Christian,
>>>>>>>          Please refer to follow log, when it enter to
>>>>>>> amdgpu_device_gpu_recover
>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware
>>>>>> fence
>>>> signal.
>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>> s_fence is already freed, but job is not freed in time. Then this issue
>occurs.
>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>> sdma0
>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>> 449.794163] amdgpu
>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>> 449.794221] Emily:amdgpu_job_free_cb,Process information: process
>>>>>>> pid 0 thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> thread pid 0,
>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>>>>>> [ 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>> dereference at
>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>> 0000 [#1] SMP PTI
>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>> 449.803488]
>>>> RIP:
>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>>> ff
>>>>>>> 45 85 e4 0f
>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>> <48> 8b
>>>> 98
>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>CR0:
>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>[
>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>> Sorry, please take your time.
>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>
>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>> NULL without the job also being freed.
>>>>>>>>
>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>
>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>> Ping.....
>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>
>>>>>>>>>>> Best wishes
>>>>>>>>>>> Emily Deng
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>> Behalf
>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>>>>>> one case, when it enter into the amdgpu_device_gpu_recover,
>>>>>>>>>>>> it already in drm_sched_job_cleanup, and at this time, it
>>>>>>>>>>>> will go to free
>>>>>> job.
>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>
>>>>>>>>>>>              drm_sched_job_cleanup(s_job);
>>>>>>>>>>>
>>>>>>>>>>>              amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>              dma_fence_put(job->fence);
>>>>>>>>>>>              amdgpu_sync_free(&job->sync);
>>>>>>>>>>>              amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>              kfree(job);
>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>> reference to the
>>>>>> s_fence.
>>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>         drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-
>----
>>>>>>>>>>>>>>         2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>         	 *
>>>>>>>>>>>>>>         	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>> +	if (job && job->base.s_fence &&
>>>>>>>>>>>>>> +job->base.s_fence->parent
>>>>>>>> &&
>>>>>>>>>>>>>>         	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>         		job_signaled = true;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>drm_sched_increase_karma(struct
>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>         			spin_lock(&rq->lock);
>>>>>>>>>>>>>>         			list_for_each_entry_safe(entity, tmp,
>>>> &rq-
>>>>>>>>> entities,
>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>> -				if (bad->s_fence-
>>scheduled.context
>>>>>>>> ==
>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>s_fence-
>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>         					if (atomic_read(&bad-
>>>>>>>>> karma) >
>>>>>>>>>>>>>>         					    bad->sched-
>>>>> hang_limit)
>>>>>>>>>>>>>>         						if (entity-
>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>         	 * This iteration is thread safe as sched thread
>>>>>>>>>>>>>> is
>>>> stopped.
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>         	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>&&
>>>>>>>>>>>>>>         		    dma_fence_remove_callback(s_job-
>>>>> s_fence-
>>>>>>>>> parent,
>>>>>>>>>>>>>>         					      &s_job->cb)) {
>>>>>>>>>>>>>>         			atomic_dec(&sched->hw_rq_count);
>>>> @@ -
>>>>>>>> 395,7
>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>         			 *
>>>>>>>>>>>>>>         			 * Job is still alive so fence refcount at
>>>> least 1
>>>>>>>>>>>>>>         			 */
>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>finished,
>>>>>>>> false);
>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>s_fence-
>>>>>>>>> finished,
>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>         			/*
>>>>>>>>>>>>>>         			 * We must keep bad job alive for later
>>>> use
>>>>>>>> during @@
>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>>> *sched,
>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>         	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>         	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>> node)
>>>>>>>>>>>>> {
>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>parent;
>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>s_job-
>>>>>>>>> s_fence-
>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>         		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-11  9:05                                                             ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-11  9:05 UTC (permalink / raw)
  To: Deng, Emily, Grodzovsky, Andrey, Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Christian and Andrey,
     The issue I encountered is that the bad job is being freed after we have already entered amdgpu_device_gpu_recover. I don't know why; as Christian said, drm_sched_cleanup_jobs calls cancel_delayed_work and should not free jobs while the timeout worker is running.
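
A minimal sketch of the interleaving I suspect (simplified pseudo-kernel-C put together from the snippets quoted below, not the exact driver code):

    /* free side: drm_sched_cleanup_jobs() -> amdgpu_job_free_cb(), sketch only */
    drm_sched_job_cleanup(s_job);    /* puts s_fence, sets s_job->s_fence = NULL */
    dma_fence_put(job->fence);
    kfree(job);                      /* the job memory itself goes away right after */

    /* recovery side: amdgpu_device_gpu_recover(), running concurrently */
    if (job && job->base.s_fence->parent &&    /* s_fence may already be NULL here */
        dma_fence_is_signaled(job->base.s_fence->parent))
            job_signaled = true;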

Best wishes
Emily Deng



>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Deng,
>Emily
>Sent: Monday, November 11, 2019 3:19 PM
>To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig, Christian
><Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Hi Andrey,
>    I don’t think your patch will help here. drm_sched_cleanup_jobs may call
>kthread_should_park first, and only then does the recovery path call
>kcl_kthread_park, so there is still a race between the two threads.
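>
>    Roughly, the interleaving I am worried about (a hypothetical sketch, not
>    code from the actual patch):
>
>        /* sched thread, start of drm_sched_cleanup_jobs(): */
>        if (kthread_should_park())      /* false - park not requested yet */
>                return;
>        /* <-- recovery thread calls kcl_kthread_park() at this point */
>        /* sched thread keeps going and frees signaled jobs from
>         * ring_mirror_list while recovery starts walking the same list */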
>
>Best wishes
>Emily Deng
>
>
>
>>-----Original Message-----
>>From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>Sent: Saturday, November 9, 2019 3:01 AM
>>To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>>
>>On 11/8/19 5:35 AM, Koenig, Christian wrote:
>>> Hi Emily,
>>>
>>> exactly that can't happen. See here:
>>>
>>>>          /* Don't destroy jobs while the timeout worker is running
>>>> */
>>>>          if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>              !cancel_delayed_work(&sched->work_tdr))
>>>>                  return NULL;
>>> We never free jobs while the timeout working is running to prevent
>>> exactly that issue.
>>
>>
>>I don't think this protects us if drm_sched_cleanup_jobs is called for
>>scheduler which didn't experience a timeout, in
>>amdgpu_device_gpu_recover we access
>>sched->ring_mirror_list for all the schedulers on a device so this condition
>>above won't protect us. What in fact could help maybe is my recent
>>patch
>>541c521 drm/sched: Avoid job cleanup if sched thread is parked. because
>>we do park each of the scheduler threads during tdr job before trying
>>to access
>>sched->ring_mirror_list.
>>
>>Emily - did you see this problem with that patch in place ? I only
>>pushed it yesterday.
>>
>>Andrey
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>> Hi Christian,
>>>>        The drm_sched_job_timedout-> amdgpu_job_timedout call
>>amdgpu_device_gpu_recover. I mean the main scheduler free the jobs
>>while in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Hi Emily,
>>>>>
>>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>
>>>>> When it's not the scheduler we shouldn't have a guilty job in the first
>place.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>> Hi Chrisitan,
>>>>>>         No, I am with the new branch and also has the patch. Even
>>>>>> it are freed by
>>>>> main scheduler, how we could avoid main scheduler to free jobs
>>>>> while enter to function amdgpu_device_gpu_recover?
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> in this case you are on an old code branch.
>>>>>>>
>>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>> timeout handler is running.
>>>>>>>
>>>>>>> See this patch here:
>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>
>>>>>>>>        drm/scheduler: rework job destruction
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>> Hi Christian,
>>>>>>>>          Please refer to follow log, when it enter to
>>>>>>>> amdgpu_device_gpu_recover
>>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware
>>>>>>> fence
>>>>> signal.
>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>>> s_fence is already freed, but job is not freed in time. Then this
>>>>>>> issue
>>occurs.
>>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>> sdma0
>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process
>information:
>>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>> 449.794163] amdgpu
>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>> 449.794221] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>> process pid 0 thread pid 0, s_job:0000000066eb74ab [
>>>>>>>> 449.794222] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>> process  pid 0 thread pid 0, s_job:00000000d4438ad9 [
>>>>>>>> 449.794255] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>> process  pid 0 thread pid 0, s_job:00000000b6d69c65 [
>>>>>>>> 449.794257] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>> process  pid 0 thread pid 0,
>>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>>>>>>> [ 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>> dereference at
>>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>>> 0000 [#1] SMP PTI
>>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G
>OE
>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>> 449.803488]
>>>>> RIP:
>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff
>>>>>>>> ff ff
>>>>>>>> 45 85 e4 0f
>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>>> <48> 8b
>>>>> 98
>>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000
>[
>>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES:
>>>>>>>> 0000
>>CR0:
>>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>0000000000000000
>>[
>>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>> Sorry, please take your time.
>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>
>>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>
>>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>> Ping.....
>>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>>
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>>> Behalf
>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is
>destroyed.
>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>>>>>>> one case, when it enter into the amdgpu_device_gpu_recover,
>>>>>>>>>>>>> it already in drm_sched_job_cleanup, and at this time, it
>>>>>>>>>>>>> will go to free
>>>>>>> job.
>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>
>>>>>>>>>>>>              drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>
>>>>>>>>>>>>              amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>              dma_fence_put(job->fence);
>>>>>>>>>>>>              amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>              amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>              kfree(job);
>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>> reference to the
>>>>>>> s_fence.
>>>>>>>>>>> So you are just papering over a much bigger problem here.
>>>>>>>>>>> This patch is a clear NAK.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>>         drivers/gpu/drm/scheduler/sched_main.c     | 11
>++++++-
>>----
>>>>>>>>>>>>>>>         2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>         	 *
>>>>>>>>>>>>>>>         	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>> +	if (job && job->base.s_fence &&
>>>>>>>>>>>>>>> +job->base.s_fence->parent
>>>>>>>>> &&
>>>>>>>>>>>>>>>         	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>>         		job_signaled = true;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>drm_sched_increase_karma(struct
>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>         			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>         			list_for_each_entry_safe(entity, tmp,
>>>>> &rq-
>>>>>>>>>> entities,
>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>> -				if (bad->s_fence-
>>>scheduled.context
>>>>>>>>> ==
>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>>s_fence-
>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>         					if (atomic_read(&bad-
>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>         					    bad->sched-
>>>>>> hang_limit)
>>>>>>>>>>>>>>>         						if (entity-
>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>         	 * This iteration is thread safe as sched thread
>>>>>>>>>>>>>>> is
>>>>> stopped.
>>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>>         	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>>&&
>>>>>>>>>>>>>>>         		    dma_fence_remove_callback(s_job-
>>>>>> s_fence-
>>>>>>>>>> parent,
>>>>>>>>>>>>>>>         					      &s_job->cb)) {
>>>>>>>>>>>>>>>         			atomic_dec(&sched->hw_rq_count);
>>>>> @@ -
>>>>>>>>> 395,7
>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>         			 *
>>>>>>>>>>>>>>>         			 * Job is still alive so fence refcount at
>>>>> least 1
>>>>>>>>>>>>>>>         			 */
>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>>finished,
>>>>>>>>> false);
>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>>s_fence-
>>>>>>>>>> finished,
>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>         			/*
>>>>>>>>>>>>>>>         			 * We must keep bad job alive for later
>>>>> use
>>>>>>>>> during @@
>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>drm_gpu_scheduler
>>>>>>> *sched,
>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>         	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>         	 */
>>>>>>>>>>>>>>>         	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>>parent;
>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>>s_job-
>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>         		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-11 18:06                                                             ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-11 18:06 UTC (permalink / raw)
  To: Deng, Emily, Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Note that kthread_park waits for kthread->parked to be signaled before
proceeding - so in the scenario you described it means the main scheduler
thread is running (not parked, so kthread->parked is not signaled), and
kthread_park will not proceed until the sched thread finishes its current
loop (including removing any signaled jobs from ring_mirror_list) and is
back at
wait_event_interruptible->drm_sched_blocked->kthread_parkme->complete(&self->parked)
to park itself - so it looks to me it should be OK.
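
A rough sketch of that ordering (hypothetical and heavily simplified - not the
exact upstream code):

    /* drm_sched_main() - scheduler thread, simplified: */
    while (!kthread_should_stop()) {
            wait_event_interruptible(...);  /* drm_sched_blocked() ->
                                             * kthread_parkme() ->
                                             * complete(&self->parked)
                                             * parks the thread here */
            /* remove signaled jobs from ring_mirror_list, push next job */
    }

    /* recovery/tdr thread: */
    kthread_park(sched->thread);    /* blocks on &kthread->parked, i.e. until
                                     * the sched thread has finished its current
                                     * loop iteration and parked itself */
    /* only then is sched->ring_mirror_list walked */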

Andrey


On 11/11/19 2:19 AM, Deng, Emily wrote:
> Hi Andrey,
>      I don’t think your patch will help here. drm_sched_cleanup_jobs may call kthread_should_park first, and only then does the recovery path call kcl_kthread_park, so there is still a race between the two threads.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Saturday, November 9, 2019 3:01 AM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>>
>> On 11/8/19 5:35 AM, Koenig, Christian wrote:
>>> Hi Emily,
>>>
>>> exactly that can't happen. See here:
>>>
>>>>           /* Don't destroy jobs while the timeout worker is running */
>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>               !cancel_delayed_work(&sched->work_tdr))
>>>>                   return NULL;
>>> We never free jobs while the timeout working is running to prevent
>>> exactly that issue.
>>
>> I don't think this protects us if drm_sched_cleanup_jobs is called for scheduler
>> which didn't experience a timeout, in amdgpu_device_gpu_recover we access
>> sched->ring_mirror_list for all the schedulers on a device so this condition
>> above won't protect us. What in fact could help maybe is my recent patch
>> 541c521 drm/sched: Avoid job cleanup if sched thread is parked. because we
>> do park each of the scheduler threads during tdr job before trying to access
>> sched->ring_mirror_list.
>>
>> Emily - did you see this problem with that patch in place ? I only pushed it
>> yesterday.
>>
>> Andrey
>>
>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>> Hi Christian,
>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout call
>> amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>> in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Hi Emily,
>>>>>
>>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>
>>>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>> Hi Chrisitan,
>>>>>>          No, I am with the new branch and also has the patch. Even
>>>>>> it are freed by
>>>>> main scheduler, how we could avoid main scheduler to free jobs while
>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> in this case you are on an old code branch.
>>>>>>>
>>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>> timeout handler is running.
>>>>>>>
>>>>>>> See this patch here:
>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>
>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>> Hi Christian,
>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>> amdgpu_device_gpu_recover
>>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware
>>>>>>> fence
>>>>> signal.
>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>>> s_fence is already freed, but job is not freed in time. Then this issue
>> occurs.
>>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>> sdma0
>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>> 449.794163] amdgpu
>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>> 449.794221] Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>> pid 0 thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>>> thread pid 0,
>>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>>>>>>> [ 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>> dereference at
>>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>>> 0000 [#1] SMP PTI
>>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>> 449.803488]
>>>>> RIP:
>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>>>> ff
>>>>>>>> 45 85 e4 0f
>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>>> <48> 8b
>>>>> 98
>>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>> CR0:
>>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> [
>>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>> Sorry, please take your time.
>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>
>>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>
>>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>> Ping.....
>>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>>
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>>> Behalf
>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>>>>>>> one case, when it enter into the amdgpu_device_gpu_recover,
>>>>>>>>>>>>> it already in drm_sched_job_cleanup, and at this time, it
>>>>>>>>>>>>> will go to free
>>>>>>> job.
>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>
>>>>>>>>>>>>               drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>
>>>>>>>>>>>>               amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>               dma_fence_put(job->fence);
>>>>>>>>>>>>               amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>               amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>               kfree(job);
>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>> reference to the
>>>>>>> s_fence.
>>>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>>          drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-
>> ----
>>>>>>>>>>>>>>>          2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>          	 *
>>>>>>>>>>>>>>>          	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>> +	if (job && job->base.s_fence &&
>>>>>>>>>>>>>>> +job->base.s_fence->parent
>>>>>>>>> &&
>>>>>>>>>>>>>>>          	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>>          		job_signaled = true;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>          			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>          			list_for_each_entry_safe(entity, tmp,
>>>>> &rq-
>>>>>>>>>> entities,
>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>> -				if (bad->s_fence-
>>> scheduled.context
>>>>>>>>> ==
>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>> s_fence-
>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>          					if (atomic_read(&bad-
>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>          					    bad->sched-
>>>>>> hang_limit)
>>>>>>>>>>>>>>>          						if (entity-
>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>          	 * This iteration is thread safe as sched thread
>>>>>>>>>>>>>>> is
>>>>> stopped.
>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>          	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>> &&
>>>>>>>>>>>>>>>          		    dma_fence_remove_callback(s_job-
>>>>>> s_fence-
>>>>>>>>>> parent,
>>>>>>>>>>>>>>>          					      &s_job->cb)) {
>>>>>>>>>>>>>>>          			atomic_dec(&sched->hw_rq_count);
>>>>> @@ -
>>>>>>>>> 395,7
>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>          			 *
>>>>>>>>>>>>>>>          			 * Job is still alive so fence refcount at
>>>>> least 1
>>>>>>>>>>>>>>>          			 */
>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>> finished,
>>>>>>>>> false);
>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>> s_fence-
>>>>>>>>>> finished,
>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>          			/*
>>>>>>>>>>>>>>>          			 * We must keep bad job alive for later
>>>>> use
>>>>>>>>> during @@
>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>>>> *sched,
>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>          	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>          	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>> parent;
>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>> s_job-
>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>          		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-11 18:06                                                             ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-11 18:06 UTC (permalink / raw)
  To: Deng, Emily, Koenig, Christian, amd-gfx

Note that kthread_park waits for kthread->parked to be signaled before 
proceeding - so in the scenario you described it means the main thread is 
running (not parked, so kthread->parked is not signaled), and 
kthread_park will not proceed until the sched thread finishes its current loop 
(including removing any signaled jobs from ring_mirror_list) and is back in 
wait_event_interruptible->drm_sched_blocked->kthread_parkme->complete(&self->parked) 
to park itself - so it looks to me it should be OK.
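
For reference, a minimal sketch of the park handshake being described - this is only an
illustration of the kthread API, not the actual scheduler code, and the function names
reset_path_sketch()/sched_thread_loop_sketch() are placeholders:

    #include <linux/kthread.h>
    #include <drm/gpu_scheduler.h>

    /* Reset path: returns only after the sched thread has parked itself. */
    static void reset_path_sketch(struct drm_gpu_scheduler *sched)
    {
    	/* blocks until the sched thread runs kthread_parkme() and completes ->parked */
    	kthread_park(sched->thread);

    	/* safe to walk sched->ring_mirror_list here */

    	kthread_unpark(sched->thread);
    }

    /* Scheduler main loop, heavily abbreviated. */
    static int sched_thread_loop_sketch(void *param)
    {
    	struct drm_gpu_scheduler *sched = param;

    	while (!kthread_should_stop()) {
    		/* cleanup step: may free signaled jobs from ring_mirror_list */

    		/*
    		 * Only after the loop body is done does the thread park itself,
    		 * which is what lets kthread_park() above return.
    		 */
    		if (kthread_should_park())
    			kthread_parkme();
    	}
    	return 0;
    }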

Andrey


On 11/11/19 2:19 AM, Deng, Emily wrote:
> Hi Andrey,
>      I don’t think your patch will help here. It may call kthread_should_park in drm_sched_cleanup_jobs first, and only then call kcl_kthread_park, so there is still a race between the two threads.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Saturday, November 9, 2019 3:01 AM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>>
>> On 11/8/19 5:35 AM, Koenig, Christian wrote:
>>> Hi Emily,
>>>
>>> exactly that can't happen. See here:
>>>
>>>>           /* Don't destroy jobs while the timeout worker is running */
>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>               !cancel_delayed_work(&sched->work_tdr))
>>>>                   return NULL;
>>> We never free jobs while the timeout working is running to prevent
>>> exactly that issue.
>>
>> I don't think this protects us if drm_sched_cleanup_jobs is called for a
>> scheduler which didn't experience a timeout: in amdgpu_device_gpu_recover we
>> access sched->ring_mirror_list for all the schedulers on a device, so the
>> condition above won't protect us. What could in fact help is my recent patch
>> 541c521 "drm/sched: Avoid job cleanup if sched thread is parked", because we
>> do park each of the scheduler threads during the tdr job before trying to
>> access sched->ring_mirror_list.
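
A rough sketch of the early-out that patch adds to the job-cleanup path (from memory,
not the exact upstream diff; in particular the __kthread_should_park() call is an
assumption here):

    static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
    {
    	/*
    	 * Don't destroy jobs while the timeout worker is running, OR while
    	 * the thread is being parked and is hence assumed not to touch
    	 * ring_mirror_list.
    	 */
    	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
    	     !cancel_delayed_work(&sched->work_tdr)) ||
    	    __kthread_should_park(sched->thread))
    		return;

    	/* ... free signaled jobs from sched->ring_mirror_list ... */
    }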
>>
>> Emily - did you see this problem with that patch in place ? I only pushed it
>> yesterday.
>>
>> Andrey
>>
>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>> Hi Christian,
>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout calls
>>>> amdgpu_device_gpu_recover. I mean the main scheduler frees the jobs while
>>>> in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Hi Emily,
>>>>>
>>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>
>>>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>          No, I am on the new branch and it also has the patch. Even if
>>>>>> the jobs are freed by the main scheduler, how can we avoid the main
>>>>>> scheduler freeing jobs while we are in amdgpu_device_gpu_recover?
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> in this case you are on an old code branch.
>>>>>>>
>>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>> timeout handler is running.
>>>>>>>
>>>>>>> See this patch here:
>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>
>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>> Hi Christian,
>>>>>>>>           Please refer to the following log: when it enters
>>>>>>>> amdgpu_device_gpu_recover, the bad job 000000005086879e is being freed in
>>>>>>>> amdgpu_job_free_cb at the same time, because the hardware fence has
>>>>>>>> signaled. But amdgpu_device_gpu_recover goes faster; in this case the
>>>>>>>> s_fence is already freed, but the job is not freed in time. Then this
>>>>>>>> issue occurs.
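
Schematically, the race being described looks like this (illustration only, written out
as a two-thread timeline):

    /*
     * Thread A (sched thread): drm_sched_cleanup_jobs()
     *   - hardware fence of the bad job has signaled
     *   - drm_sched_job_cleanup() sets job->s_fence to NULL
     *   - amdgpu_job_free_cb() then frees the job itself
     *
     * Thread B (timeout worker): drm_sched_job_timedout()
     *   -> amdgpu_job_timedout() -> amdgpu_device_gpu_recover(adev, job)
     *   - still holds the job pointer taken before thread A ran
     *   - dereferences job->base.s_fence->parent while s_fence is already
     *     NULL -> the NULL pointer dereference shown in the oops below
     */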
>>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>> sdma0
>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>> 449.794163] amdgpu
>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>> 449.794221] Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>> pid 0 thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>>> thread pid 0,
>>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>>>>>>> [ 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>> dereference at
>>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>>> 0000 [#1] SMP PTI
>>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>> 449.803488]
>>>>> RIP:
>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>>>> ff
>>>>>>>> 45 85 e4 0f
>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>>> <48> 8b
>>>>> 98
>>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>> CR0:
>>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> [
>>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>> Sorry, please take your time.
>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>
>>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>
>>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>> Ping.....
>>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>>
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>>> Behalf
>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>>>> case, when it enters amdgpu_device_gpu_recover, it is already in
>>>>>>>>>>>>> drm_sched_job_cleanup, and at this time it is about to free the
>>>>>>>>>>>>> job. But amdgpu_device_gpu_recover sometimes goes faster; at that
>>>>>>>>>>>>> time the job is not freed yet, but s_fence is already NULL.
>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>
>>>>>>>>>>>>               drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>
>>>>>>>>>>>>               amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>               dma_fence_put(job->fence);
>>>>>>>>>>>>               amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>               amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>               kfree(job);
>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>> reference to the
>>>>>>> s_fence.
>>>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>>          drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>>>>>          2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>          	 *
>>>>>>>>>>>>>>>          	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>          	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>>          		job_signaled = true;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>          			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>          			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>          					if (atomic_read(&bad->karma) >
>>>>>>>>>>>>>>>          					    bad->sched->hang_limit)
>>>>>>>>>>>>>>>          						if (entity->guilty)
>>>>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>          	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>          	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>>>>          		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>>>>>>>>>>          					      &s_job->cb)) {
>>>>>>>>>>>>>>>          			atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>          			 *
>>>>>>>>>>>>>>>          			 * Job is still alive so fence refcount at least 1
>>>>>>>>>>>>>>>          			 */
>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>          			/*
>>>>>>>>>>>>>>>          			 * We must keep bad job alive for later use during
>>>>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>>>>>>>>>>          	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>          	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>          		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-11 21:11                                                             ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-11 21:11 UTC (permalink / raw)
  To: Deng, Emily, Grodzovsky, Andrey,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 22251 bytes --]

Hi Emily,

you need to print which scheduler instance is freeing the jobs and which 
one is triggering the reset. The TID and PID are completely meaningless 
here since we are called from different worker threads and the TID/PID 
can change on each call.
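
As an illustration only, one way to get that information is to tag the prints with the
scheduler name instead of the TID/PID (the exact message format is of course up to
whoever is debugging):

    /* in the job-cleanup path, when a job is about to be freed */
    printk("%s: freeing s_job %p\n", sched->name, s_job);

    /* in amdgpu_job_timedout(), when recovery is about to start */
    printk("%s: triggering GPU reset for s_job %p\n",
           job->base.sched->name, &job->base);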

Apart from that I will look into this a bit deeper when I have time.

Regards,
Christian.

Am 12.11.19 um 07:02 schrieb Deng, Emily:
> Hi Christian,
>     I added the following print in drm_sched_cleanup_jobs. From the log it
> shows that using cancel_delayed_work alone cannot prevent a job from being
> freed while the sched is in reset. But I don't know exactly where the driver
> goes wrong. Do you have any suggestion about this?
>
> +       printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n",
> +              current->tgid, current->pid);
>         /*
>          * Don't destroy jobs while the timeout worker is running  OR thread
>          * is being parked and hence assumed to not touch ring_mirror_list
>          */
>         if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>             !cancel_delayed_work(&sched->work_tdr)))
>                 return;
> +       printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
> +              current->tgid, current->pid);
> Best wishes
> Emily Deng
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 
> timeout, signaled seq=78585, emitted seq=78587
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
> information: process  pid 0 thread  pid 0, 
> s_job:00000000fe75ab36,tid=15603, pid=15603
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225428] Emily:amdgpu_job_free_cb,Process information: process  
> pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
> [11381.225494] Emily:amdgpu_job_free_cb,Process information: process  
> pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, pid:2262
> >-----Original Message-----
> >From: Grodzovsky, Andrey <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
> >Sent: Tuesday, November 12, 2019 11:28 AM
> >To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; Deng, Emily
> ><Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
> >
> >Thinking more about this claim - we assume here that if cancel_delayed_work
> >returned true it guarantees that the timeout work is not running. But it merely
> >means there was a pending timeout work which was removed from the
> >workqueue before its timer elapsed, so it didn't have a chance to be
> >dequeued and executed; it doesn't cover already-executing work. So there is a
> >possibility that while one timeout work is executing, another timeout work
> >already got enqueued (maybe through earlier cleanup jobs or through
> >drm_sched_fault), and if at this point another drm_sched_cleanup_jobs runs,
> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
> >timeout job in progress.
> >Unfortunately we cannot change cancel_delayed_work to
> >cancel_delayed_work_sync to flush the timeout work, as the timeout work itself
> >waits for the schedule thread to be parked again when calling park_thread.
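
In other words, a simplified illustration of the workqueue semantics in question (not
the scheduler code itself):

    if (cancel_delayed_work(&sched->work_tdr)) {
    	/*
    	 * True only means a queued-but-not-yet-started timeout work was
    	 * removed before its timer fired; it says nothing about a timeout
    	 * handler that is already running on a worker thread.
    	 */
    }

    /*
     * cancel_delayed_work_sync() would also wait for a running handler to
     * finish, but it cannot be used here: the timeout handler itself parks
     * the scheduler thread, so waiting for it from this path would deadlock.
     */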
> >
> >Andrey
> >
> >________________________________________
> >From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> on behalf of
> >Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
> >Sent: 08 November 2019 05:35:18
> >To: Deng, Emily; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
> >
> >Hi Emily,
> >
> >exactly that can't happen. See here:
> >
> >>         /* Don't destroy jobs while the timeout worker is running */
> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> >>            !cancel_delayed_work(&sched->work_tdr))
> >>                 return NULL;
> >
> >We never free jobs while the timeout working is running to prevent exactly
> >that issue.
> >
> >Regards,
> >Christian.
> >
> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
> >> Hi Christian,
> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
> >>
> >> Best wishes
> >> Emily Deng
> >>
> >>
> >>
> >>> -----Original Message-----
> >>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
> >>> Sent: Friday, November 8, 2019 6:26 PM
> >>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
> >>>
> >>> Hi Emily,
> >>>
> >>> well who is calling amdgpu_device_gpu_recover() in this case?
> >>>
> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
> >>>
> >>> Regards,
> >>> Christian.
> >>>
> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
> >>>> Hi Chrisitan,
> >>>>        No, I am with the new branch and also has the patch. Even it
> >>>> are freed by
> >>> main scheduler, how we could avoid main scheduler to free jobs while
> >>> enter to function amdgpu_device_gpu_recover?
> >>>> Best wishes
> >>>> Emily Deng
> >>>>
> >>>>
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
> >>>>> Sent: Friday, November 8, 2019 6:15 PM
> >>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
> >gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
> >>>>>
> >>>>> Hi Emily,
> >>>>>
> >>>>> in this case you are on an old code branch.
> >>>>>
> >>>>> Jobs are freed now by the main scheduler thread and only if no
> >>>>> timeout handler is running.
> >>>>>
> >>>>> See this patch here:
> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
> >>>>>> Author: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org>
> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
> >>>>>>
> >>>>>>       drm/scheduler: rework job destruction
> >>>>> Regards,
> >>>>> Christian.
> >>>>>
> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
> >>>>>> Hi Christian,
> >>>>>>         Please refer to follow log, when it enter to
> >>>>>> amdgpu_device_gpu_recover
> >>>>> function, the bad job 000000005086879e is freeing in function
> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
> >>> signal.
> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
> >occurs.
> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
> >>> sdma0
> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
> >>>>> amdgpu
> >>>>> 0000:00:08.0: GPU reset begin!
> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
> >>>>>> thread pid 0,
> >>>>> s_job:00000000ea85e922 [ 449.794287]
> >>>>> Emily:amdgpu_job_free_cb,Process
> >>>>> information: process  pid 0 thread pid 0, s_job:00000000ed3a5ac6 [
> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
> >>>>> at
> >>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
> >>>>> 0000 [#1] SMP PTI
> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
> >>>>>> 449.803488]
> >>> RIP:
> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
> >>>>>> ff
> >>>>>> 45 85 e4 0f
> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
> >>>>> <48> 8b
> >>> 98
> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
> >>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
> >>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 ES: 0000
> >CR0:
> >>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> >>>>> 0000000000000400 [  449.811937] Call Trace:
> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
> >>>>>> worker_thread+0x34/0x410 [ 449.814728]  kthread+0x121/0x140 [
> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
> >>>>>>
> >>>>>>> -----Original Message-----
> >>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
> >>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
> >>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
> >>>>>>> tdr
> >>>>>>>
> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
> >>>>>>>> Sorry, please take your time.
> >>>>>>> Have you seen my other response a bit below?
> >>>>>>>
> >>>>>>> I can't follow how it would be possible for job->s_fence to be
> >>>>>>> NULL without the job also being freed.
> >>>>>>>
> >>>>>>> So it looks like this patch is just papering over some bigger issues.
> >>>>>>>
> >>>>>>> Regards,
> >>>>>>> Christian.
> >>>>>>>
> >>>>>>>> Best wishes
> >>>>>>>> Emily Deng
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>> -----Original Message-----
> >>>>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
> >>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
> >>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
> >>>>>>>>> tdr
> >>>>>>>>>
> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
> >>>>>>>>>> Ping.....
> >>>>>>>>> You need to give me at least enough time to wake up :)
> >>>>>>>>>
> >>>>>>>>>> Best wishes
> >>>>>>>>>> Emily Deng
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>>> -----Original Message-----
> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> On
> >>> Behalf
> >>>>>>>>>>> Of Deng, Emily
> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; amd-
> >>>>>>>>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
> >>>>>>>>>>> for tdr
> >>>>>>>>>>>
> >>>>>>>>>>>> -----Original Message-----
> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>;
> >>>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
> >>>>>>>>>>>> for tdr
> >>>>>>>>>>>>
> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
> >>>>>>>>>>>> See drm_sched_job_cleanup().
> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
> >>>>>>>>>>> go to free
> >>>>> job.
> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
> >>>>>>>>> No, that case can't happen. See here:
> >>>>>>>>>
> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
> >>>>>>>>>>
> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
> >>>>>>>>>>            dma_fence_put(job->fence);
> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
> >>>>>>>>>>            kfree(job);
> >>>>>>>>> The job itself is freed up directly after freeing the reference
> >>>>>>>>> to the
> >>>>> s_fence.
> >>>>>>>>> So you are just papering over a much bigger problem here. This
> >>>>>>>>> patch is a clear NAK.
> >>>>>>>>>
> >>>>>>>>> Regards,
> >>>>>>>>> Christian.
> >>>>>>>>>
> >>>>>>>>>>>> When you see a job without an s_fence then that means the
> >>>>>>>>>>>> problem is somewhere else.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Regards,
> >>>>>>>>>>>> Christian.
> >>>>>>>>>>>>
> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng-5C7GfCeVMHo@public.gmane.org>
> >>>>>>>>>>>>> ---
> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> >>>>>>>>>>>>>            *
> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
> >>>>>>>>>>>>>            */
> >>>>>>>>>>>>> - if (job && job->base.s_fence->parent &&
> >>>>>>>>>>>>> + if (job && job->base.s_fence && job->base.s_fence->parent &&
> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
> >>>>>>>>>>>>>                   job_signaled = true;
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context ==
> >>>>>>>>>>>>> -                              entity->fence_context) {
> >>>>>>>>>>>>> +                          if (bad->s_fence && (bad->s_fence->scheduled.context ==
> >>>>>>>>>>>>> +                              entity->fence_context)) {
> >>>>>>>>>>>>>                                           if (atomic_read(&bad->karma) >
> >>>>>>>>>>>>>                                               bad->sched->hang_limit)
> >>>>>>>>>>>>>                                                   if (entity->guilty)
> >>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread is stopped.
> >>>>>>>>>>>>>            */
> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job->s_fence->parent,
> >>>>>>>>>>>>>                                                 &s_job->cb)) {
> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
> >>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
> >>>>>>>>>>>>>                            *
> >>>>>>>>>>>>>                            * Job is still alive so fence refcount at least 1
> >>>>>>>>>>>>>                            */
> >>>>>>>>>>>>> -                  dma_fence_wait(&s_job->s_fence->finished, false);
> >>>>>>>>>>>>> +                  if (s_job->s_fence)
> >>>>>>>>>>>>> +                          dma_fence_wait(&s_job->s_fence->finished, false);
> >>>>>>>>>>>>>                           /*
> >>>>>>>>>>>>>                            * We must keep bad job alive for later use during
> >>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
> >>>>>>>>>>>>>            */
> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
> >>>>>>>>>>>>>


[-- Attachment #1.2: Type: text/html, Size: 52798 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

> >>>>>>>>>>>>>            */
> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
> >>>>>>>>>>>>> &sched->ring_mirror_list,
> >>>>>>>>>>>>> node)
> >>>>>>>>>>>> {
> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
> >>>>>>>>>>>>> + s_job-
> >>>>>>>> s_fence-
> >>>>>>>>>>>>> parent :
> >>>>>>>>>>>>> +NULL;
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
> >>>>>>>>>>>>>
> >>>>>>>>>>> _______________________________________________
> >>>>>>>>>>> amd-gfx mailing list
> >>>>>>>>>>> amd-gfx@lists.freedesktop.org
> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> >
> >_______________________________________________
> >amd-gfx mailing list
> >amd-gfx@lists.freedesktop.org
> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[-- Attachment #1.2: Type: text/html, Size: 51362 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-11 21:35                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-11 21:35 UTC (permalink / raw)
  To: Deng, Emily, Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Emily - is there a particular scenario to reproduce this? I am trying
with the libdrm deadlock test and artificially delaying the GPU reset
logic until after the guilty job has signaled, but indeed nothing bad
happens, as drm_sched_cleanup_jobs returns early because there is a
reset in progress and so the bad job is not released while the GPU
reset is running.

Can you provide event tracing for timer, dma_fence and gpu_scheduler for
when the problem happens?
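
Something along these lines is enough to turn those event groups on from a
small helper (a minimal sketch, not part of the patch: it assumes tracefs is
mounted at the usual /sys/kernel/debug/tracing path and that the timer,
dma_fence and gpu_scheduler groups exist on the target kernel; plain echo
commands into the same files work just as well):

	/*
	 * Minimal sketch: select the timer, dma_fence and gpu_scheduler trace
	 * event groups and switch tracing on, so the failing run can then be
	 * read back from /sys/kernel/debug/tracing/trace.
	 */
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			fprintf(stderr, "%s: %s\n", path, strerror(errno));
			return -1;
		}
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		const char *base = "/sys/kernel/debug/tracing";
		char path[256];

		snprintf(path, sizeof(path), "%s/set_event", base);
		if (write_str(path, "timer:*\ndma_fence:*\ngpu_scheduler:*\n"))
			return 1;

		snprintf(path, sizeof(path), "%s/tracing_on", base);
		if (write_str(path, "1\n"))
			return 1;

		printf("tracing enabled; read %s/trace after reproducing\n", base);
		return 0;
	}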

Andrey

On 11/11/19 4:05 AM, Deng, Emily wrote:
> Hi Christian and Andrey,
>       The issue I encountered is that the bad job is being freed after entering amdgpu_device_gpu_recover. I don't know why; as Christian said, it will call cancel_delayed_work in drm_sched_cleanup_jobs.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Deng,
>> Emily
>> Sent: Monday, November 11, 2019 3:19 PM
>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Hi Andrey,
>>     I don't think your patch will help with this. It may call
>> kthread_should_park in drm_sched_cleanup_jobs first, and then call
>> kcl_kthread_park, so there is still a race between the two threads.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Sent: Saturday, November 9, 2019 3:01 AM
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>>
>>> On 11/8/19 5:35 AM, Koenig, Christian wrote:
>>>> Hi Emily,
>>>>
>>>> exactly that can't happen. See here:
>>>>
>>>>>           /* Don't destroy jobs while the timeout worker is running
>>>>> */
>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>               !cancel_delayed_work(&sched->work_tdr))
>>>>>                   return NULL;
>>>> We never free jobs while the timeout worker is running to prevent
>>>> exactly that issue.
>>>
>>> I don't think this protects us if drm_sched_cleanup_jobs is called for a
>>> scheduler which didn't experience a timeout; in amdgpu_device_gpu_recover
>>> we access sched->ring_mirror_list for all the schedulers on a device, so
>>> this condition above won't protect us. What in fact could help maybe is
>>> my recent patch 541c521 drm/sched: Avoid job cleanup if sched thread is
>>> parked, because we do park each of the scheduler threads during tdr
>>> before trying to access sched->ring_mirror_list.
>>>
>>> Emily - did you see this problem with that patch in place ? I only
>>> pushed it yesterday.
>>>
>>> Andrey
>>>
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>> Hi Christian,
>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout call
>>> amdgpu_device_gpu_recover. I mean the main scheduler free the jobs
>>> while in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Hi Emily,
>>>>>>
>>>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>
>>>>>> When it's not the scheduler we shouldn't have a guilty job in the first
>> place.
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>> Hi Chrisitan,
>>>>>>>          No, I am with the new branch and also has the patch. Even
>>>>>>> it are freed by
>>>>>> main scheduler, how we could avoid main scheduler to free jobs
>>>>>> while enter to function amdgpu_device_gpu_recover?
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Hi Emily,
>>>>>>>>
>>>>>>>> in this case you are on an old code branch.
>>>>>>>>
>>>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>>> timeout handler is running.
>>>>>>>>
>>>>>>>> See this patch here:
>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>
>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>> Hi Christian,
>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware
>>>>>>>> fence
>>>>>> signal.
>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>>>> s_fence is already freed, but job is not freed in time. Then this
>>>>>>>> issue
>>> occurs.
>>>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>>> sdma0
>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process
>> information:
>>>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>> 449.794163] amdgpu
>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>> 449.794221] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process pid 0 thread pid 0, s_job:0000000066eb74ab [
>>>>>>>>> 449.794222] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process  pid 0 thread pid 0, s_job:00000000d4438ad9 [
>>>>>>>>> 449.794255] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process  pid 0 thread pid 0, s_job:00000000b6d69c65 [
>>>>>>>>> 449.794257] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process  pid 0 thread pid 0,
>>>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6
>>>>>>>> [ 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>> dereference at
>>>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G
>> OE
>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>>> 449.803488]
>>>>>> RIP:
>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff
>>>>>>>>> ff ff
>>>>>>>>> 45 85 e4 0f
>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>>>> <48> 8b
>>>>>> 98
>>>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000
>> [
>>>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES:
>>>>>>>>> 0000
>>> CR0:
>>>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>> 0000000000000000
>>> [
>>>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>>> tdr
>>>>>>>>>>
>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>
>>>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>
>>>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>>> Best wishes
>>>>>>>>>>> Emily Deng
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>>>
>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>> Behalf
>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is
>> destroyed.
>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>>>>>>>> one case, when it enter into the amdgpu_device_gpu_recover,
>>>>>>>>>>>>>> it already in drm_sched_job_cleanup, and at this time, it
>>>>>>>>>>>>>> will go to free
>>>>>>>> job.
>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>
>>>>>>>>>>>>>               drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>
>>>>>>>>>>>>>               amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>               dma_fence_put(job->fence);
>>>>>>>>>>>>>               amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>               amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>               kfree(job);
>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>>> reference to the
>>>>>>>> s_fence.
>>>>>>>>>>>> So you are just papering over a much bigger problem here.
>>>>>>>>>>>> This patch is a clear NAK.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>>>          drivers/gpu/drm/scheduler/sched_main.c     | 11
>> ++++++-
>>> ----
>>>>>>>>>>>>>>>>          2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>          	 *
>>>>>>>>>>>>>>>>          	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>> +	if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>> +job->base.s_fence->parent
>>>>>>>>>> &&
>>>>>>>>>>>>>>>>          	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>>>          		job_signaled = true;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>          			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>          			list_for_each_entry_safe(entity, tmp,
>>>>>> &rq-
>>>>>>>>>>> entities,
>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>> -				if (bad->s_fence-
>>>> scheduled.context
>>>>>>>>>> ==
>>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>>> s_fence-
>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>>          					if (atomic_read(&bad-
>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>          					    bad->sched-
>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>          						if (entity-
>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>          	 * This iteration is thread safe as sched thread
>>>>>>>>>>>>>>>> is
>>>>>> stopped.
>>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>>          	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>>> &&
>>>>>>>>>>>>>>>>          		    dma_fence_remove_callback(s_job-
>>>>>>> s_fence-
>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>          					      &s_job->cb)) {
>>>>>>>>>>>>>>>>          			atomic_dec(&sched->hw_rq_count);
>>>>>> @@ -
>>>>>>>>>> 395,7
>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>          			 *
>>>>>>>>>>>>>>>>          			 * Job is still alive so fence refcount at
>>>>>> least 1
>>>>>>>>>>>>>>>>          			 */
>>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>>> finished,
>>>>>>>>>> false);
>>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>>> s_fence-
>>>>>>>>>>> finished,
>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>          			/*
>>>>>>>>>>>>>>>>          			 * We must keep bad job alive for later
>>>>>> use
>>>>>>>>>> during @@
>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>> drm_gpu_scheduler
>>>>>>>> *sched,
>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>          	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>>          	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>>> parent;
>>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>>> s_job-
>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>          		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>> _______________________________________________
>>>>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-12  3:28                                                     ` Grodzovsky, Andrey
  0 siblings, 0 replies; 80+ messages in thread
From: Grodzovsky, Andrey @ 2019-11-12  3:28 UTC (permalink / raw)
  To: Koenig, Christian, Deng, Emily, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Thinking more about this claim - we assume here that if cancel_delayed_work
returns true it guarantees that the timeout work is not running, but it
merely means there was a pending timeout work which was removed from the
workqueue before its timer elapsed, so it never got a chance to be dequeued
and executed; it does not cover work that is already executing. So there is
a possibility that, while one timeout work has started executing, another
timeout work has already been enqueued (maybe through earlier cleanup jobs
or through drm_sched_fault), and if at this point another
drm_sched_cleanup_jobs runs, cancel_delayed_work(&sched->work_tdr) will
return true even while a timeout job is in progress.

Unfortunately we cannot change cancel_delayed_work to
cancel_delayed_work_sync to flush the timeout work, as the timeout work
itself waits for the scheduler thread to be parked again when calling
park_thread.
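
To make that window concrete, below is a small stand-alone userspace model of
what then goes wrong once the free path and recovery overlap (purely
illustrative, not kernel code: the structs and the two threads are stand-ins
for the scheduler job, its s_fence, the cleanup path and
amdgpu_device_gpu_recover). Even with the NULL checks from the patch there is
still a gap between the check and the dereference:

	/*
	 * Illustrative model of the check-then-use race, not kernel code.
	 * One thread tears the job down the way the cleanup path would, the
	 * other performs the NULL-checked dereference that recovery does.
	 * Build with: cc -pthread race.c
	 */
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct fence { int signaled; };
	struct sched_fence { struct fence *parent; };
	struct job { struct sched_fence *s_fence; };

	static struct job *bad_job;

	static void *free_path(void *arg)
	{
		struct sched_fence *f = bad_job->s_fence;

		(void)arg;
		bad_job->s_fence = NULL;	/* stands in for drm_sched_job_cleanup() */
		free(f->parent);
		free(f);
		free(bad_job);			/* stands in for kfree(job) */
		return NULL;
	}

	static void *recover_path(void *arg)
	{
		(void)arg;
		/* stands in for the check in amdgpu_device_gpu_recover() */
		if (bad_job && bad_job->s_fence && bad_job->s_fence->parent)
			printf("parent->signaled = %d\n",
			       bad_job->s_fence->parent->signaled);
		return NULL;
	}

	int main(void)
	{
		pthread_t freer, recoverer;

		bad_job = calloc(1, sizeof(*bad_job));
		bad_job->s_fence = calloc(1, sizeof(*bad_job->s_fence));
		bad_job->s_fence->parent = calloc(1, sizeof(*bad_job->s_fence->parent));

		pthread_create(&freer, NULL, free_path, NULL);
		pthread_create(&recoverer, NULL, recover_path, NULL);
		pthread_join(freer, NULL);
		pthread_join(recoverer, NULL);
		return 0;
	}

Depending on scheduling it prints a value, reads freed memory or crashes -
the same nondeterminism behind the NULL-pointer oops Emily reported, and why
a reference on the job (or excluding the free path) is needed rather than a
NULL check alone.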

Andrey

________________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Koenig, Christian <Christian.Koenig@amd.com>
Sent: 08 November 2019 05:35:18
To: Deng, Emily; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr

Hi Emily,

exactly that can't happen. See here:

>         /* Don't destroy jobs while the timeout worker is running */
>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>             !cancel_delayed_work(&sched->work_tdr))
>                 return NULL;

We never free jobs while the timeout worker is running to prevent
exactly that issue.

Regards,
Christian.

Am 08.11.19 um 11:32 schrieb Deng, Emily:
> Hi Christian,
>       The drm_sched_job_timedout-> amdgpu_job_timedout call amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, November 8, 2019 6:26 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Hi Emily,
>>
>> well who is calling amdgpu_device_gpu_recover() in this case?
>>
>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>
>> Regards,
>> Christian.
>>
>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>> Hi Chrisitan,
>>>        No, I am with the new branch and also has the patch. Even it are freed by
>> main scheduler, how we could avoid main scheduler to free jobs while enter
>> to function amdgpu_device_gpu_recover?
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Hi Emily,
>>>>
>>>> in this case you are on an old code branch.
>>>>
>>>> Jobs are freed now by the main scheduler thread and only if no
>>>> timeout handler is running.
>>>>
>>>> See this patch here:
>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>
>>>>>       drm/scheduler: rework job destruction
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>> Hi Christian,
>>>>>         Please refer to follow log, when it enter to
>>>>> amdgpu_device_gpu_recover
>>>> function, the bad job 000000005086879e is freeing in function
>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>> signal.
>>>> But amdgpu_device_gpu_recover goes faster, at this case, the s_fence
>>>> is already freed, but job is not freed in time. Then this issue occurs.
>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>> sdma0
>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>> amdgpu
>>>> 0000:00:08.0: GPU reset begin!
>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>>>>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0,
>>>> s_job:00000000ea85e922 [  449.794287]
>>>> Emily:amdgpu_job_free_cb,Process
>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference at
>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>>>> [#1] SMP PTI
>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [  449.803488]
>> RIP:
>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff
>>>>> 45 85 e4 0f
>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b
>> 98
>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>> Sorry, please take your time.
>>>>>> Have you seen my other response a bit below?
>>>>>>
>>>>>> I can't follow how it would be possible for job->s_fence to be NULL
>>>>>> without the job also being freed.
>>>>>>
>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>> Ping.....
>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>> Behalf
>>>>>>>>>> Of Deng, Emily
>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>>> tdr
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>> When the job is already signaled, the s_fence is freed. Then
>>>>>>>>>>>> it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go
>>>>>>>>>> to free
>>>> job.
>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>>>>> No, that case can't happen. See here:
>>>>>>>>
>>>>>>>>>             drm_sched_job_cleanup(s_job);
>>>>>>>>>
>>>>>>>>>             amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>             dma_fence_put(job->fence);
>>>>>>>>>             amdgpu_sync_free(&job->sync);
>>>>>>>>>             amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>             kfree(job);
>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>> to the
>>>> s_fence.
>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>> patch is a clear NAK.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>        drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>>        2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>             *
>>>>>>>>>>>>             * job->base holds a reference to parent fence
>>>>>>>>>>>>             */
>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>> +  if (job && job->base.s_fence && job->base.s_fence->parent
>>>>>> &&
>>>>>>>>>>>>                dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>                    job_signaled = true;
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>> *bad)
>>>>>>>>>>>>
>>>>>>>>>>>>                            spin_lock(&rq->lock);
>>>>>>>>>>>>                            list_for_each_entry_safe(entity, tmp,
>> &rq-
>>>>>>> entities,
>>>>>>>>>>> list) {
>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>>>>> ==
>>>>>>>>>>>> -                              entity->fence_context) {
>>>>>>>>>>>> +                          if (bad->s_fence && (bad->s_fence-
>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>> +                              entity->fence_context)) {
>>>>>>>>>>>>                                            if (atomic_read(&bad-
>>>>>>> karma) >
>>>>>>>>>>>>                                                bad->sched-
>>> hang_limit)
>>>>>>>>>>>>                                                    if (entity-
>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>> drm_sched_stop(struct
>>>>>> drm_gpu_scheduler
>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>             * This iteration is thread safe as sched thread is
>> stopped.
>>>>>>>>>>>>             */
>>>>>>>>>>>>            list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>                        dma_fence_remove_callback(s_job-
>>> s_fence-
>>>>>>> parent,
>>>>>>>>>>>>                                                  &s_job->cb)) {
>>>>>>>>>>>>                            atomic_dec(&sched->hw_rq_count);
>> @@ -
>>>>>> 395,7
>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>                             *
>>>>>>>>>>>>                             * Job is still alive so fence refcount at
>> least 1
>>>>>>>>>>>>                             */
>>>>>>>>>>>> -                  dma_fence_wait(&s_job->s_fence->finished,
>>>>>> false);
>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>>>>>>>>> +                          dma_fence_wait(&s_job->s_fence-
>>>>>>> finished,
>>>>>>>>>>> false);
>>>>>>>>>>>>                            /*
>>>>>>>>>>>>                             * We must keep bad job alive for later
>> use
>>>>>> during @@
>>>>>>>>>>> -438,7
>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>> *sched,
>>>>>>>>>>>> +bool
>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>             * GPU recovers can't run in parallel.
>>>>>>>>>>>>             */
>>>>>>>>>>>>            list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>> node)
>>>>>>>>>>> {
>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ? s_job-
>>>>>>> s_fence-
>>>>>>>>>>>> parent :
>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>
>>>>>>>>>>>>                    atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>
>>>>>>>>>> _______________________________________________
>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-12  3:28                                                     ` Grodzovsky, Andrey
  0 siblings, 0 replies; 80+ messages in thread
From: Grodzovsky, Andrey @ 2019-11-12  3:28 UTC (permalink / raw)
  To: Koenig, Christian, Deng, Emily, amd-gfx

Thinking more about this claim - we assume here that if cancel_delayed_work() returns true it guarantees the timeout work is not running. But it merely means there was a pending timeout work which was removed from the workqueue before its timer elapsed, so it never got a chance to be dequeued and executed; it does not cover work that is already executing. So there is a possibility that, while one timeout work is executing, another timeout work gets enqueued (maybe through earlier cleanup jobs or through drm_sched_fault), and if another drm_sched_cleanup_jobs runs at that point, cancel_delayed_work(&sched->work_tdr) will return true even though a timeout handler is still in progress.
Unfortunately we cannot change cancel_delayed_work to cancel_delayed_work_sync to flush the timeout work, as the timeout work itself waits for the scheduler thread to be parked again when calling park_thread.
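For reference, a minimal sketch of the guard being discussed (the drm_sched_cleanup_jobs check quoted further down), annotated with the hole described above; this is a sketch only, and the surrounding function differs slightly between the branches quoted in this thread:

        /* Bail out while a timeout worker is pending. cancel_delayed_work()
         * returns true only when a *pending* work_tdr was removed before its
         * timer fired; it does not wait for a work_tdr instance that is
         * already executing on another CPU.
         */
        if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
            !cancel_delayed_work(&sched->work_tdr))
                return;

        /* The hole: if drm_sched_job_timedout() is already running and a new
         * timeout was re-armed in the meantime (e.g. by earlier cleanup or by
         * drm_sched_fault()), the check above still passes and job freeing
         * proceeds concurrently with the timeout handler.
         * cancel_delayed_work_sync() would close the race but deadlocks here,
         * because the timeout handler parks this same scheduler thread.
         */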

Andrey

________________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Koenig, Christian <Christian.Koenig@amd.com>
Sent: 08 November 2019 05:35:18
To: Deng, Emily; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr

Hi Emily,

exactly that can't happen. See here:

>         /* Don't destroy jobs while the timeout worker is running */
>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>             !cancel_delayed_work(&sched->work_tdr))
>                 return NULL;

We never free jobs while the timeout working is running to prevent
exactly that issue.

Regards,
Christian.

Am 08.11.19 um 11:32 schrieb Deng, Emily:
> Hi Christian,
>       The drm_sched_job_timedout-> amdgpu_job_timedout call amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, November 8, 2019 6:26 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Hi Emily,
>>
>> well who is calling amdgpu_device_gpu_recover() in this case?
>>
>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>
>> Regards,
>> Christian.
>>
>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>> Hi Chrisitan,
>>>        No, I am with the new branch and also has the patch. Even it are freed by
>> main scheduler, how we could avoid main scheduler to free jobs while enter
>> to function amdgpu_device_gpu_recover?
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Hi Emily,
>>>>
>>>> in this case you are on an old code branch.
>>>>
>>>> Jobs are freed now by the main scheduler thread and only if no
>>>> timeout handler is running.
>>>>
>>>> See this patch here:
>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>
>>>>>       drm/scheduler: rework job destruction
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>> Hi Christian,
>>>>>         Please refer to follow log, when it enter to
>>>>> amdgpu_device_gpu_recover
>>>> function, the bad job 000000005086879e is freeing in function
>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>> signal.
>>>> But amdgpu_device_gpu_recover goes faster, at this case, the s_fence
>>>> is already freed, but job is not freed in time. Then this issue occurs.
>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>> sdma0
>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>> amdgpu
>>>> 0000:00:08.0: GPU reset begin!
>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information: process
>>>>> pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread
>>>>> pid 0,
>>>> s_job:00000000ea85e922 [  449.794287]
>>>> Emily:amdgpu_job_free_cb,Process
>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference at
>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops: 0000
>>>> [#1] SMP PTI
>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [  449.803488]
>> RIP:
>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff
>>>>> 45 85 e4 0f
>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b
>> 98
>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000 CR0:
>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>> Sorry, please take your time.
>>>>>> Have you seen my other response a bit below?
>>>>>>
>>>>>> I can't follow how it would be possible for job->s_fence to be NULL
>>>>>> without the job also being freed.
>>>>>>
>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>> Ping.....
>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>> Behalf
>>>>>>>>>> Of Deng, Emily
>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>>> tdr
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>> When the job is already signaled, the s_fence is freed. Then
>>>>>>>>>>>> it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will go
>>>>>>>>>> to free
>>>> job.
>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At that
>>>>>>>>>> time, job is not freed, but s_fence is already NULL.
>>>>>>>> No, that case can't happen. See here:
>>>>>>>>
>>>>>>>>>             drm_sched_job_cleanup(s_job);
>>>>>>>>>
>>>>>>>>>             amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>             dma_fence_put(job->fence);
>>>>>>>>>             amdgpu_sync_free(&job->sync);
>>>>>>>>>             amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>             kfree(job);
>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>> to the
>>>> s_fence.
>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>> patch is a clear NAK.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>        drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>>        2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>             *
>>>>>>>>>>>>             * job->base holds a reference to parent fence
>>>>>>>>>>>>             */
>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>> +  if (job && job->base.s_fence && job->base.s_fence->parent
>>>>>> &&
>>>>>>>>>>>>                dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>                    job_signaled = true;
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct
>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>> *bad)
>>>>>>>>>>>>
>>>>>>>>>>>>                            spin_lock(&rq->lock);
>>>>>>>>>>>>                            list_for_each_entry_safe(entity, tmp,
>> &rq-
>>>>>>> entities,
>>>>>>>>>>> list) {
>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>>>>> ==
>>>>>>>>>>>> -                              entity->fence_context) {
>>>>>>>>>>>> +                          if (bad->s_fence && (bad->s_fence-
>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>> +                              entity->fence_context)) {
>>>>>>>>>>>>                                            if (atomic_read(&bad-
>>>>>>> karma) >
>>>>>>>>>>>>                                                bad->sched-
>>> hang_limit)
>>>>>>>>>>>>                                                    if (entity-
>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>> drm_sched_stop(struct
>>>>>> drm_gpu_scheduler
>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>             * This iteration is thread safe as sched thread is
>> stopped.
>>>>>>>>>>>>             */
>>>>>>>>>>>>            list_for_each_entry_safe_reverse(s_job, tmp, &sched-
>>>>>>>>>>>> ring_mirror_list, node) {
>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>                        dma_fence_remove_callback(s_job-
>>> s_fence-
>>>>>>> parent,
>>>>>>>>>>>>                                                  &s_job->cb)) {
>>>>>>>>>>>>                            atomic_dec(&sched->hw_rq_count);
>> @@ -
>>>>>> 395,7
>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>                             *
>>>>>>>>>>>>                             * Job is still alive so fence refcount at
>> least 1
>>>>>>>>>>>>                             */
>>>>>>>>>>>> -                  dma_fence_wait(&s_job->s_fence->finished,
>>>>>> false);
>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>>>>>>>>> +                          dma_fence_wait(&s_job->s_fence-
>>>>>>> finished,
>>>>>>>>>>> false);
>>>>>>>>>>>>                            /*
>>>>>>>>>>>>                             * We must keep bad job alive for later
>> use
>>>>>> during @@
>>>>>>>>>>> -438,7
>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>> *sched,
>>>>>>>>>>>> +bool
>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>             * GPU recovers can't run in parallel.
>>>>>>>>>>>>             */
>>>>>>>>>>>>            list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>> node)
>>>>>>>>>>> {
>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ? s_job-
>>>>>>> s_fence-
>>>>>>>>>>>> parent :
>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>
>>>>>>>>>>>>                    atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>
>>>>>>>>>> _______________________________________________
>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-12  5:48                                                                     ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-12  5:48 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Andrey,
     On my side it doesn't need a specific scenario; I only run the quark test with a slow job. Then sometimes there is a fake hang and the hardware fence comes back. In this case, the NULL pointer issue in amdgpu_device_gpu_recover occurs randomly.

>-----Original Message-----
>From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>Sent: Tuesday, November 12, 2019 5:35 AM
>To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian
><Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Emily - is there a particular scenario to reproduce this ? I am trying with libdrm
>deadlock test and artificially delaying the GPU reset logic until after the guilty
>job is signaling but indeed nothing bad happens as drm_sched_cleanup_jobs
>returns early because there is a reset in progress and so the bad job is not
>getting released while GPU reset is running.
>
>Can you provide event tracing for timer, dma_fence and gpu_scheduler for
>when the problem happens ?
>
>Andrey
>
>On 11/11/19 4:05 AM, Deng, Emily wrote:
>> Hi Christian and Andrey,
>>       The issue I encountered is the bad job is freeing after entering to the
>amdgpu_device_gpu_recover. Don't know why, as per Christian said, it will
>call cancel_delayed_work in drm_sched_cleanup_jobs.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Deng, Emily
>>> Sent: Monday, November 11, 2019 3:19 PM
>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig,
>Christian
>>> <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Hi Andrey,
>>>     I don’t think your patch will help for this. As it will may call
>>> kthread_should_park in drm_sched_cleanup_jobs first, and then call
>>> kcl_kthread_park. And then it still has a race between the 2 threads.
>>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>> Sent: Saturday, November 9, 2019 3:01 AM
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>>
>>>> On 11/8/19 5:35 AM, Koenig, Christian wrote:
>>>>> Hi Emily,
>>>>>
>>>>> exactly that can't happen. See here:
>>>>>
>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>> running */
>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>               !cancel_delayed_work(&sched->work_tdr))
>>>>>>                   return NULL;
>>>>> We never free jobs while the timeout working is running to prevent
>>>>> exactly that issue.
>>>>
>>>> I don't think this protects us if drm_sched_cleanup_jobs is called
>>>> for scheduler which didn't experience a timeout, in
>>>> amdgpu_device_gpu_recover we access
>>>> sched->ring_mirror_list for all the schedulers on a device so this
>>>> sched->condition
>>>> above won't protect us. What in fact could help maybe is my recent
>>>> patch
>>>> 541c521 drm/sched: Avoid job cleanup if sched thread is parked.
>>>> because we do park each of the scheduler threads during tdr job
>>>> before trying to access
>>>> sched->ring_mirror_list.
>>>>
>>>> Emily - did you see this problem with that patch in place ? I only
>>>> pushed it yesterday.
>>>>
>>>> Andrey
>>>>
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>> amdgpu_device_gpu_recover. I mean the main scheduler free the jobs
>>>> while in amdgpu_device_gpu_recover, and before calling
>drm_sched_stop.
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>
>>>>>>> When it's not the scheduler we shouldn't have a guilty job in the
>>>>>>> first
>>> place.
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>> Hi Chrisitan,
>>>>>>>>          No, I am with the new branch and also has the patch.
>>>>>>>> Even it are freed by
>>>>>>> main scheduler, how we could avoid main scheduler to free jobs
>>>>>>> while enter to function amdgpu_device_gpu_recover?
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Hi Emily,
>>>>>>>>>
>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>
>>>>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>>>> timeout handler is running.
>>>>>>>>>
>>>>>>>>> See this patch here:
>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>
>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>> Hi Christian,
>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware
>>>>>>>>> fence
>>>>>>> signal.
>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>>>>> s_fence is already freed, but job is not freed in time. Then
>>>>>>>>> this issue
>>>> occurs.
>>>>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>ring
>>>>>>> sdma0
>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process
>>> information:
>>>>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>> 449.794163] amdgpu
>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>> 449.794221] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process pid 0 thread pid 0, s_job:0000000066eb74ab [
>>>>>>>>>> 449.794222] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread pid 0, s_job:00000000d4438ad9 [
>>>>>>>>>> 449.794255] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread pid 0, s_job:00000000b6d69c65 [
>>>>>>>>>> 449.794257] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread pid 0,
>>>>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>> information: process  pid 0 thread  pid 0,
>>>>>>>>> s_job:00000000ed3a5ac6 [ 449.794366] BUG: unable to handle
>>>>>>>>> kernel NULL pointer dereference at
>>>>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040]
>Oops:
>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G
>>> OE
>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>>>> 449.803488]
>>>>>>> RIP:
>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff
>>>>>>>>>> ff ff
>>>>>>>>>> 45 85 e4 0f
>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40
>>>>>>>>> 10 <48> 8b
>>>>>>> 98
>>>>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000
>RCX:
>>>>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>0000000000000000
>>> [
>>>>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148
>R12:
>>>>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES:
>>>>>>>>>> 0000
>>>> CR0:
>>>>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>> 0000000000000000
>>>> [
>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu]
>[
>>>>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched]
>[
>>>>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140
>[
>>>>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>
>>>>>>>>>>> I can't follow how it would be possible for job->s_fence to
>>>>>>>>>>> be NULL without the job also being freed.
>>>>>>>>>>>
>>>>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org>
>On
>>>>>>> Behalf
>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Christian König
><ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>>>>> Then it will has null pointer in
>amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is
>>> destroyed.
>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>>>>>>>>> one case, when it enter into the
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover, it already in
>>>>>>>>>>>>>>> drm_sched_job_cleanup, and at this time, it will go to
>>>>>>>>>>>>>>> free
>>>>>>>>> job.
>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster.
>At
>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>
>>>>>>>>>>>>>>               drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>               dma_fence_put(job->fence);
>>>>>>>>>>>>>>               amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>               amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>               kfree(job);
>>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>>>> reference to the
>>>>>>>>> s_fence.
>>>>>>>>>>>>> So you are just papering over a much bigger problem here.
>>>>>>>>>>>>> This patch is a clear NAK.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> When you see a job without an s_fence then that means
>>>>>>>>>>>>>>>> the problem is somewhere else.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |
>2 +-
>>>>>>>>>>>>>>>>>          drivers/gpu/drm/scheduler/sched_main.c     | 11
>>> ++++++-
>>>> ----
>>>>>>>>>>>>>>>>>          2 files changed, 7 insertions(+), 6
>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> diff --git
>a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>          	 *
>>>>>>>>>>>>>>>>>          	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>> +	if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>> +job->base.s_fence->parent
>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>          	    dma_fence_is_signaled(job->base.s_fence-
>>parent))
>>>>>>>>>>>>>>>>>          		job_signaled = true;
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>          			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>
>	list_for_each_entry_safe(entity, tmp,
>>>>>>> &rq-
>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>> -				if (bad->s_fence-
>>>>> scheduled.context
>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad-
>>>>> s_fence-
>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>>>          					if
>(atomic_read(&bad-
>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>          					    bad->sched-
>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>          						if
>(entity-
>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>          	 * This iteration is thread safe as sched
>>>>>>>>>>>>>>>>> thread is
>>>>>>> stopped.
>>>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>>>          	list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent
>>>> &&
>>>>>>>>>>>>>>>>>          		    dma_fence_remove_callback(s_job-
>>>>>>>> s_fence-
>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>          					      &s_job-
>>cb)) {
>>>>>>>>>>>>>>>>>          			atomic_dec(&sched-
>>hw_rq_count);
>>>>>>> @@ -
>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>          			 *
>>>>>>>>>>>>>>>>>          			 * Job is still alive so fence
>refcount at
>>>>>>> least 1
>>>>>>>>>>>>>>>>>          			 */
>>>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence-
>>>>> finished,
>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job-
>>>>> s_fence-
>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>          			/*
>>>>>>>>>>>>>>>>>          			 * We must keep bad job alive
>for later
>>>>>>> use
>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>> drm_gpu_scheduler
>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>          	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>>>          	list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence-
>>>>> parent;
>>>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ?
>>>> s_job-
>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>          		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> _______________________________________________
>>>>>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-12  5:48                                                                     ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-12  5:48 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian, amd-gfx

Hi Andrey,
     On my side, it doesn't need to a specific scenario, I only run the quark with slow job. Then sometimes, it will have fake hang and hardware fence will back. For this case, it will randomly occur the NULL pointer issue in amdgpu_device_gpu_recover.

>-----Original Message-----
>From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>Sent: Tuesday, November 12, 2019 5:35 AM
>To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian
><Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Emily - is there a particular scenario to reproduce this ? I am trying with libdrm
>deadlock test and artificially delaying the GPU reset logic until after the guilty
>job is signaling but indeed nothing bad happens as drm_sched_cleanup_jobs
>returns early because there is a reset in progress and so the bad job is not
>getting released while GPU reset is running.
>
>Can you provide event tracing for timer, dma_fence and gpu_scheduler for
>when the problem happens ?
>
>Andrey
>
>On 11/11/19 4:05 AM, Deng, Emily wrote:
>> Hi Christian and Andrey,
>>       The issue I encountered is the bad job is freeing after entering to the
>amdgpu_device_gpu_recover. Don't know why, as per Christian said, it will
>call cancel_delayed_work in drm_sched_cleanup_jobs.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Deng, Emily
>>> Sent: Monday, November 11, 2019 3:19 PM
>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig,
>Christian
>>> <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Hi Andrey,
>>>     I don’t think your patch will help for this. As it will may call
>>> kthread_should_park in drm_sched_cleanup_jobs first, and then call
>>> kcl_kthread_park. And then it still has a race between the 2 threads.
>>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>> Sent: Saturday, November 9, 2019 3:01 AM
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>>
>>>> On 11/8/19 5:35 AM, Koenig, Christian wrote:
>>>>> Hi Emily,
>>>>>
>>>>> exactly that can't happen. See here:
>>>>>
>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>> running */
>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>               !cancel_delayed_work(&sched->work_tdr))
>>>>>>                   return NULL;
>>>>> We never free jobs while the timeout working is running to prevent
>>>>> exactly that issue.
>>>>
>>>> I don't think this protects us if drm_sched_cleanup_jobs is called
>>>> for scheduler which didn't experience a timeout, in
>>>> amdgpu_device_gpu_recover we access
>>>> sched->ring_mirror_list for all the schedulers on a device so this
>>>> sched->condition
>>>> above won't protect us. What in fact could help maybe is my recent
>>>> patch
>>>> 541c521 drm/sched: Avoid job cleanup if sched thread is parked.
>>>> because we do park each of the scheduler threads during tdr job
>>>> before trying to access
>>>> sched->ring_mirror_list.
>>>>
>>>> Emily - did you see this problem with that patch in place ? I only
>>>> pushed it yesterday.
>>>>
>>>> Andrey
>>>>
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>> amdgpu_device_gpu_recover. I mean the main scheduler free the jobs
>>>> while in amdgpu_device_gpu_recover, and before calling
>drm_sched_stop.
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>
>>>>>>> When it's not the scheduler we shouldn't have a guilty job in the
>>>>>>> first
>>> place.
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>> Hi Chrisitan,
>>>>>>>>          No, I am with the new branch and also has the patch.
>>>>>>>> Even it are freed by
>>>>>>> main scheduler, how we could avoid main scheduler to free jobs
>>>>>>> while enter to function amdgpu_device_gpu_recover?
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Hi Emily,
>>>>>>>>>
>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>
>>>>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>>>> timeout handler is running.
>>>>>>>>>
>>>>>>>>> See this patch here:
>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>
>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>> Hi Christian,
>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>> function, the bad job 000000005086879e is freeing in function
>>>>>>>>> amdgpu_job_free_cb  at the same time, because of the hardware
>>>>>>>>> fence
>>>>>>> signal.
>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>>>>> s_fence is already freed, but job is not freed in time. Then
>>>>>>>>> this issue
>>>> occurs.
>>>>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>ring
>>>>>>> sdma0
>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process
>>> information:
>>>>>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>> 449.794163] amdgpu
>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>> 449.794221] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process pid 0 thread pid 0, s_job:0000000066eb74ab [
>>>>>>>>>> 449.794222] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread pid 0, s_job:00000000d4438ad9 [
>>>>>>>>>> 449.794255] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread pid 0, s_job:00000000b6d69c65 [
>>>>>>>>>> 449.794257] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread pid 0,
>>>>>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>> information: process  pid 0 thread  pid 0,
>>>>>>>>> s_job:00000000ed3a5ac6 [ 449.794366] BUG: unable to handle
>>>>>>>>> kernel NULL pointer dereference at
>>>>>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040]
>Oops:
>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G
>>> OE
>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>>>> 449.803488]
>>>>>>> RIP:
>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff
>>>>>>>>>> ff ff
>>>>>>>>>> 45 85 e4 0f
>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40
>>>>>>>>> 10 <48> 8b
>>>>>>> 98
>>>>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000
>RCX:
>>>>>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>0000000000000000
>>> [
>>>>>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148
>R12:
>>>>>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES:
>>>>>>>>>> 0000
>>>> CR0:
>>>>>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>> 0000000000000000
>>>> [
>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu]
>[
>>>>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched]
>[
>>>>>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140
>[
>>>>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>
>>>>>>>>>>> I can't follow how it would be possible for job->s_fence to
>>>>>>>>>>> be NULL without the job also being freed.
>>>>>>>>>>>
>>>>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org>
>On
>>>>>>> Behalf
>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Christian König
><ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>>>>>> Then it will has null pointer in
>amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is
>>> destroyed.
>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>>>>>>>>> one case, when it enter into the
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover, it already in
>>>>>>>>>>>>>>> drm_sched_job_cleanup, and at this time, it will go to
>>>>>>>>>>>>>>> free
>>>>>>>>> job.
>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster.
>At
>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>
>>>>>>>>>>>>>>               drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>               dma_fence_put(job->fence);
>>>>>>>>>>>>>>               amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>               amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>               kfree(job);
>>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>>>> reference to the
>>>>>>>>> s_fence.
>>>>>>>>>>>>> So you are just papering over a much bigger problem here.
>>>>>>>>>>>>> This patch is a clear NAK.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> When you see a job without an s_fence then that means
>>>>>>>>>>>>>>>> the problem is somewhere else.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |
>2 +-
>>>>>>>>>>>>>>>>>          drivers/gpu/drm/scheduler/sched_main.c     | 11
>>> ++++++-
>>>> ----
>>>>>>>>>>>>>>>>>          2 files changed, 7 insertions(+), 6
>>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> diff --git
>a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>          	 *
>>>>>>>>>>>>>>>>>          	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>>>          	 */
>>>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>> 	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>>>> 		job_signaled = true;
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>  			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>  			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>>>  					if (atomic_read(&bad->karma) >
>>>>>>>>>>>>>>>>>  					    bad->sched->hang_limit)
>>>>>>>>>>>>>>>>>  						if (entity->guilty)
>>>>>>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>  	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>>>  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>  		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>>>>>>>>>>>>  					      &s_job->cb)) {
>>>>>>>>>>>>>>>>>  			atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>  			 *
>>>>>>>>>>>>>>>>>  			 * Job is still alive so fence refcount at least 1
>>>>>>>>>>>>>>>>>  			 */
>>>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>  			/*
>>>>>>>>>>>>>>>>>  			 * We must keep bad job alive for later use during
>>>>>>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>>>>>>>>>>>>  	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>>>  	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>  		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-12  6:02                                                         ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-12  6:02 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 20340 bytes --]

Hi Christian,
    I added the following print in drm_sched_cleanup_jobs. From the log it shows that using cancel_delayed_work alone cannot prevent a job from being freed while the scheduler is in reset. But I don't know exactly where the driver goes wrong. Do you have any suggestions about this?

+       printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", current->tgid, current->pid);

        /*
         * Don't destroy jobs while the timeout worker is running  OR thread
         * is being parked and hence assumed to not touch ring_mirror_list
         */
         if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
            !cancel_delayed_work(&sched->work_tdr)))
                return;
+       printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", current->tgid, current->pid);


Best wishes
Emily Deng

Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 timeout, signaled seq=78585, emitted seq=78587
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process  pid 0 thread  pid 0, s_job:00000000fe75ab36,tid=15603, pid=15603
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225428] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: [11381.225494] Emily:amdgpu_job_free_cb,Process information: process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, pid:2262
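
For reference, the prints above record only the worker thread's tid/pid,
not which scheduler instance the free or the reset belongs to. A minimal
sketch of the same instrumentation tagged with the instance (relying on
the name that drm_sched_init() stores in sched->name; the format string
and placement are illustrative only) could be:

+       /* sketch only: also log the scheduler instance so that frees and
+        * resets can be matched per ring; tgid/pid are pid_t, hence %d
+        */
+       printk("drm_sched_cleanup_jobs: sched:%s tid:%d pid:%d\n",
+              sched->name, current->tgid, current->pid);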
>-----Original Message-----
>From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>Sent: Tuesday, November 12, 2019 11:28 AM
>To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Thinking more about this claim - we assume here that if cancel_delayed_work
>returned true it guarantees that timeout work is not running but, it merely
>means there was a pending timeout work which was removed from the
>workqueue before it's timer elapsed and so it didn't have a chance to be
>dequeued and executed, it doesn't cover already executing work. So there is a
>possibility where while timeout work started executing another timeout work
>already got enqueued (maybe through earlier cleanup jobs or through
>drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>timeout job in progress.
>Unfortunately we cannot change cancel_delayed_work to
>cancel_delayed_work_sync to flush the timeout work as timeout work itself
>waits for schedule thread  to be parked again when calling park_thread.
>
>Andrey
>
>________________________________________
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of
>Koenig, Christian <Christian.Koenig@amd.com>
>Sent: 08 November 2019 05:35:18
>To: Deng, Emily; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Hi Emily,
>
>exactly that can't happen. See here:
>
>>         /* Don't destroy jobs while the timeout worker is running */
>>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>             !cancel_delayed_work(&sched->work_tdr))
>>                 return NULL;
>
>We never free jobs while the timeout working is running to prevent exactly
>that issue.
>
>Regards,
>Christian.
>
>Am 08.11.19 um 11:32 schrieb Deng, Emily:
>> Hi Christian,
>>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Friday, November 8, 2019 6:26 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Hi Emily,
>>>
>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>
>>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>> Hi Chrisitan,
>>>>        No, I am with the new branch and also has the patch. Even it
>>>> are freed by
>>> main scheduler, how we could avoid main scheduler to free jobs while
>>> enter to function amdgpu_device_gpu_recover?
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Hi Emily,
>>>>>
>>>>> in this case you are on an old code branch.
>>>>>
>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>> timeout handler is running.
>>>>>
>>>>> See this patch here:
>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>
>>>>>>       drm/scheduler: rework job destruction
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>         Please refer to follow log, when it enter to
>>>>>> amdgpu_device_gpu_recover
>>>>> function, the bad job 000000005086879e is freeing in function
>>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>> signal.
>>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>> s_fence is already freed, but job is not freed in time. Then this issue
>occurs.
>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>> sdma0
>>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>> amdgpu
>>>>> 0000:00:08.0: GPU reset begin!
>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> thread pid 0,
>>>>> s_job:00000000ea85e922 [  449.794287]
>>>>> Emily:amdgpu_job_free_cb,Process
>>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>> at
>>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>> 0000 [#1] SMP PTI
>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>> 449.803488]
>>> RIP:
>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>> ff
>>>>>> 45 85 e4 0f
>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>> <48> 8b
>>> 98
>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>> 0000000000000000 [  449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>> 0000000000000000 [  449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>CR0:
>>>>>> 0000000080050033 [  449.810153] CR2: 00000000000000c0 CR3:
>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>> 0000000000000400 [  449.811937] Call Trace:
>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>> 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> 449.814077]  process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> tdr
>>>>>>>
>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>> Sorry, please take your time.
>>>>>>> Have you seen my other response a bit below?
>>>>>>>
>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>> NULL without the job also being freed.
>>>>>>>
>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> gfx@lists.freedesktop.org
>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>>>> tdr
>>>>>>>>>
>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>> Ping.....
>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>> Behalf
>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>
>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>>>>>>> go to free
>>>>> job.
>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>
>>>>>>>>>>             drm_sched_job_cleanup(s_job);
>>>>>>>>>>
>>>>>>>>>>             amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>             dma_fence_put(job->fence);
>>>>>>>>>>             amdgpu_sync_free(&job->sync);
>>>>>>>>>>             amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>             kfree(job);
>>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>>>> to the
>>>>> s_fence.
>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>> patch is a clear NAK.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>        drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>--
>>>>>>>>>>>>>        2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>             *
>>>>>>>>>>>>>             * job->base holds a reference to parent fence
>>>>>>>>>>>>>             */
>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>> &&
>>>>>>>>>>>>>                dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>                    job_signaled = true;
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>drm_sched_increase_karma(struct
>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>
>>>>>>>>>>>>>                            spin_lock(&rq->lock);
>>>>>>>>>>>>>                            list_for_each_entry_safe(entity,
>>>>>>>>>>>>> tmp,
>>> &rq-
>>>>>>>> entities,
>>>>>>>>>>>> list) {
>>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>>>>>> ==
>>>>>>>>>>>>> -                              entity->fence_context) {
>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>> +                              entity->fence_context)) {
>>>>>>>>>>>>>                                            if
>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>> karma) >
>>>>>>>>>>>>>                                                bad->sched-
>>>> hang_limit)
>>>>>>>>>>>>>                                                    if
>>>>>>>>>>>>> (entity-
>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>             * This iteration is thread safe as sched thread
>>>>>>>>>>>>> is
>>> stopped.
>>>>>>>>>>>>>             */
>>>>>>>>>>>>>            list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>>                        dma_fence_remove_callback(s_job-
>>>> s_fence-
>>>>>>>> parent,
>>>>>>>>>>>>>                                                  &s_job->cb)) {
>>>>>>>>>>>>>                            atomic_dec(&sched->hw_rq_count);
>>> @@ -
>>>>>>> 395,7
>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>                             *
>>>>>>>>>>>>>                             * Job is still alive so fence
>>>>>>>>>>>>> refcount at
>>> least 1
>>>>>>>>>>>>>                             */
>>>>>>>>>>>>> -                  dma_fence_wait(&s_job->s_fence->finished,
>>>>>>> false);
>>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>>>>>>>>>> +                          dma_fence_wait(&s_job->s_fence-
>>>>>>>> finished,
>>>>>>>>>>>> false);
>>>>>>>>>>>>>                            /*
>>>>>>>>>>>>>                             * We must keep bad job alive
>>>>>>>>>>>>> for later
>>> use
>>>>>>> during @@
>>>>>>>>>>>> -438,7
>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>> *sched,
>>>>>>>>>>>>> +bool
>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>             * GPU recovers can't run in parallel.
>>>>>>>>>>>>>             */
>>>>>>>>>>>>>            list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>> node)
>>>>>>>>>>>> {
>>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>>>>>>>>>>>>> + s_job-
>>>>>>>> s_fence-
>>>>>>>>>>>>> parent :
>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>
>>>>>>>>>>>>>                    atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>
>>>>>>>>>>> _______________________________________________
>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx@lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[-- Attachment #1.2: Type: text/html, Size: 40933 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-12 19:21                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-12 19:21 UTC (permalink / raw)
  To: Christian König, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 23705 bytes --]

I was able to reproduce the crash by using the attached
simulate_crash.patch - waiting on the guilty job to signal in the reset
work and artificially rearming the timeout timer just before the check
for !cancel_delayed_work(&sched->work_tdr) in drm_sched_cleanup_jobs -
the crash log is attached as crash.log. I think this confirms the theory
I described earlier in this thread.
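
simulate_crash.patch itself is an attachment and is not reproduced in
this archive; a rough sketch of the re-arming half of the idea, as a
fragment placed just above the existing cancel check in
drm_sched_cleanup_jobs (an illustration of the described race, not the
actual patch), might be:

+	/* artificially re-arm the tdr timer so that the cancel in the
+	 * existing check below succeeds even though a timeout handler
+	 * is already executing
+	 */
+	schedule_delayed_work(&sched->work_tdr, 0);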

basic_fix.patch handles this by testing whether another timer is already
armed on this scheduler or a timeout work is executing right now (see the
documentation for work_busy). Obviously this is not a full solution, as
it will not protect against races where work is scheduled immediately,
such as in drm_sched_fault - so we probably need to account for this by
making drm_sched_cleanup_jobs (at least the part where it iterates the
ring mirror list and frees jobs) and GPU reset really mutually exclusive,
not like now.
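
basic_fix.patch is likewise an attachment; a rough sketch of the check
described above, using work_busy() on the tdr work so that both a pending
and an already-executing timeout handler block job cleanup (illustration
only, not the actual patch), might be:

+	/* bail out of job cleanup if a timeout handler is pending or
+	 * currently running on this scheduler
+	 */
+	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
+	    work_busy(&sched->work_tdr.work))
+		return;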

Andrey


On 11/11/19 4:11 PM, Christian König wrote:
> Hi Emily,
>
> you need to print which scheduler instance is freeing the jobs and 
> which one is triggering the reset. The TID and PID is completely 
> meaningless here since we are called from different worker threads and 
> the TID/PID can change on each call.
>
> Apart from that I will look into this a bit deeper when I have time.
>
> Regards,
> Christian.
>
> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>> Hi Christian,
>>     I add the follow print in function drm_sched_cleanup_jobs. From 
>> the log it shows that only use cancel_delayed_work could not avoid to 
>> free job when the sched is in reset. But don’t know exactly where it 
>> is wrong about the driver. Do you have any suggestion about this?
>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", 
>> current->tgid, current->pid);
>>         /*
>>          * Don't destroy jobs while the timeout worker is running  OR 
>> thread
>>          * is being parked and hence assumed to not touch 
>> ring_mirror_list
>>          */
>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>> !cancel_delayed_work(&sched->work_tdr)))
>>                 return;
>> +       printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>> current->tgid, current->pid);
>> Best wishes
>> Emily Deng
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 
>> timeout, signaled seq=78585, emitted seq=78587
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>> information: process  pid 0 thread  pid 0, 
>> s_job:00000000fe75ab36,tid=15603, pid=15603
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: process  
>> pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: process  
>> pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, pid:2262
>> >-----Original Message-----
>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
>> >Sent: Tuesday, November 12, 2019 11:28 AM
>> >To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; Deng, Emily
>> ><Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>> >
>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>> >returned true it guarantees that timeout work is not running but, it merely
>> >means there was a pending timeout work which was removed from the
>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>> >dequeued and executed, it doesn't cover already executing work. So there is a
>> >possibility where while timeout work started executing another timeout work
>> >already got enqueued (maybe through earlier cleanup jobs or through
>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>> >timeout job in progress.
>> >Unfortunately we cannot change cancel_delayed_work to
>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>> >waits for schedule thread  to be parked again when calling park_thread.
>> >
>> >Andrey
>> >
>> >________________________________________
>> >From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> on behalf of
>> >Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>> >Sent: 08 November 2019 05:35:18
>> >To: Deng, Emily; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>> >
>> >Hi Emily,
>> >
>> >exactly that can't happen. See here:
>> >
>> >>         /* Don't destroy jobs while the timeout worker is running */
>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>> >>            !cancel_delayed_work(&sched->work_tdr))
>> >>                 return NULL;
>> >
>> >We never free jobs while the timeout working is running to prevent exactly
>> >that issue.
>> >
>> >Regards,
>> >Christian.
>> >
>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>> >> Hi Christian,
>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>> >>
>> >> Best wishes
>> >> Emily Deng
>> >>
>> >>
>> >>
>> >>> -----Original Message-----
>> >>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>> >>> Sent: Friday, November 8, 2019 6:26 PM
>> >>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>> >>>
>> >>> Hi Emily,
>> >>>
>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>> >>>
>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>> >>>
>> >>> Regards,
>> >>> Christian.
>> >>>
>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>> >>>> Hi Chrisitan,
>> >>>>        No, I am with the new branch and also has the patch. Even it
>> >>>> are freed by
>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>> >>> enter to function amdgpu_device_gpu_recover?
>> >>>> Best wishes
>> >>>> Emily Deng
>> >>>>
>> >>>>
>> >>>>
>> >>>>> -----Original Message-----
>> >>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>> >>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>> >gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>> >>>>>
>> >>>>> Hi Emily,
>> >>>>>
>> >>>>> in this case you are on an old code branch.
>> >>>>>
>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>> >>>>> timeout handler is running.
>> >>>>>
>> >>>>> See this patch here:
>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>> >>>>>> Author: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org>
>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>> >>>>>>
>> >>>>>>       drm/scheduler: rework job destruction
>> >>>>> Regards,
>> >>>>> Christian.
>> >>>>>
>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>> >>>>>> Hi Christian,
>> >>>>>>         Please refer to follow log, when it enter to
>> >>>>>> amdgpu_device_gpu_recover
>> >>>>> function, the bad job 000000005086879e is freeing in function
>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>> >>> signal.
>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>> >occurs.
>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>> >>> sdma0
>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>> >>>>> amdgpu
>> >>>>> 0000:00:08.0: GPU reset begin!
>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>> >>>>>> thread pid 0,
>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>> >>>>> Emily:amdgpu_job_free_cb,Process
>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>> >>>>> at
>> >>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>> >>>>> 0000 [#1] SMP PTI
>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>> >>>>>> 449.803488]
>> >>> RIP:
>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>> >>>>>> ff
>> >>>>>> 45 85 e4 0f
>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>> >>>>> <48> 8b
>> >>> 98
>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 ES: 0000
>> >CR0:
>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>> >>>>> 0000000000000400 [  449.811937] Call Trace:
>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>> >>>>>> worker_thread+0x34/0x410 [ 449.814728]  kthread+0x121/0x140 [
>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>> >>>>>>
>> >>>>>>> -----Original Message-----
>> >>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>> >>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>> >>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>> >>>>>>> tdr
>> >>>>>>>
>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>> >>>>>>>> Sorry, please take your time.
>> >>>>>>> Have you seen my other response a bit below?
>> >>>>>>>
>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>> >>>>>>> NULL without the job also being freed.
>> >>>>>>>
>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>> >>>>>>>
>> >>>>>>> Regards,
>> >>>>>>> Christian.
>> >>>>>>>
>> >>>>>>>> Best wishes
>> >>>>>>>> Emily Deng
>> >>>>>>>>
>> >>>>>>>>
>> >>>>>>>>
>> >>>>>>>>> -----Original Message-----
>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>> >>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>> >>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>> >>>>>>>>> tdr
>> >>>>>>>>>
>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>> >>>>>>>>>> Ping.....
>> >>>>>>>>> You need to give me at least enough time to wake up :)
>> >>>>>>>>>
>> >>>>>>>>>> Best wishes
>> >>>>>>>>>> Emily Deng
>> >>>>>>>>>>
>> >>>>>>>>>>
>> >>>>>>>>>>
>> >>>>>>>>>>> -----Original Message-----
>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> On
>> >>> Behalf
>> >>>>>>>>>>> Of Deng, Emily
>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; amd-
>> >>>>>>>>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>> >>>>>>>>>>> for tdr
>> >>>>>>>>>>>
>> >>>>>>>>>>>> -----Original Message-----
>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>;
>> >>>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>> >>>>>>>>>>>> for tdr
>> >>>>>>>>>>>>
>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>> >>>>>>>>>>> case, when we enter amdgpu_device_gpu_recover, the job is
>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at that point it is about
>> >>>>>>>>>>> to be freed.
>> >>>>>>>>>>> But amdgpu_device_gpu_recover is sometimes faster. At that
>> >>>>>>>>>>> time, the job is not freed yet, but s_fence is already NULL.
>> >>>>>>>>> No, that case can't happen. See here:
>> >>>>>>>>>
>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>> >>>>>>>>>>
>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>> >>>>>>>>>>            dma_fence_put(job->fence);
>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>> >>>>>>>>>>            kfree(job);
>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>> >>>>>>>>> to the
>> >>>>> s_fence.
>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>> >>>>>>>>> patch is a clear NAK.
>> >>>>>>>>>
>> >>>>>>>>> Regards,
>> >>>>>>>>> Christian.
>> >>>>>>>>>
>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>> >>>>>>>>>>>> problem is somewhere else.
>> >>>>>>>>>>>>
>> >>>>>>>>>>>> Regards,
>> >>>>>>>>>>>> Christian.
>> >>>>>>>>>>>>
>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng-5C7GfCeVMHo@public.gmane.org>
>> >>>>>>>>>>>>> ---
>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>> >--
>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>> >>> amdgpu_device_gpu_recover(struct
>> >>>>>>>>>>>> amdgpu_device *adev,
>> >>>>>>>>>>>>>            *
>> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
>> >>>>>>>>>>>>>            */
>> >>>>>>>>>>>>> - if (job && job->base.s_fence->parent &&
>> >>>>>>>>>>>>> + if (job && job->base.s_fence &&
>> >>>>>>>>>>>>> + job->base.s_fence->parent
>> >>>>>>> &&
>> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
>> >>>>>>>>>>>>>                   job_signaled = true;
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>> >drm_sched_increase_karma(struct
>> >>>>>>>>>>>> drm_sched_job
>> >>>>>>>>>>>>> *bad)
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
>> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity,
>> >>>>>>>>>>>>> tmp,
>> >>> &rq-
>> >>>>>>>> entities,
>> >>>>>>>>>>>> list) {
>> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>> >>>>>>> ==
>> >>>>>>>>>>>>> -                              entity->fence_context) {
>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>> >>>>>>>>>>>>> + (bad->s_fence-
>> >>>>>>>>>>>>> scheduled.context ==
>> >>>>>>>>>>>>> +                              entity->fence_context)) {
>> >>>>>>>>>>>>>                                           if
>> >>>>>>>>>>>>> (atomic_read(&bad-
>> >>>>>>>> karma) >
>> >>>>>>>>>>>>>                                               bad->sched-
>> >>>> hang_limit)
>> >>>>>>>>>>>>>                                                   if
>> >>>>>>>>>>>>> (entity-
>> >>>> guilty) @@ -376,7 +376,7 @@ void
>> >>>>>>>>>>>>> drm_sched_stop(struct
>> >>>>>>> drm_gpu_scheduler
>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread
>> >>>>>>>>>>>>> is
>> >>> stopped.
>> >>>>>>>>>>>>>            */
>> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp,
>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job-
>> >>>> s_fence-
>> >>>>>>>> parent,
>> >>>>>>>>>>>>>                                                 &s_job->cb)) {
>> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
>> >>> @@ -
>> >>>>>>> 395,7
>> >>>>>>>>>>> +395,8 @@ void
>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>> >>>>>>>>>>>>>                            *
>> >>>>>>>>>>>>>                            * Job is still alive so fence
>> >>>>>>>>>>>>> refcount at
>> >>> least 1
>> >>>>>>>>>>>>>                            */
>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>> >>>>>>> false);
>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>> >>>>>>>> finished,
>> >>>>>>>>>>>> false);
>> >>>>>>>>>>>>>                           /*
>> >>>>>>>>>>>>>                            * We must keep bad job alive
>> >>>>>>>>>>>>> for later
>> >>> use
>> >>>>>>> during @@
>> >>>>>>>>>>>> -438,7
>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>> >>>>> *sched,
>> >>>>>>>>>>>>> +bool
>> >>>>>>>>>>>> full_recovery)
>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>> >>>>>>>>>>>>>            */
>> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>> >>>>>>>>>>>>> node)
>> >>>>>>>>>>>> {
>> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>> >>>>>>>>>>>>> + s_job-
>> >>>>>>>> s_fence-
>> >>>>>>>>>>>>> parent :
>> >>>>>>>>>>>>> +NULL;
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
>> >>>>>>>>>>>>>
>> >>>>>>>>>>> _______________________________________________
>> >>>>>>>>>>> amd-gfx mailing list
>> >>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>> >
>> >_______________________________________________
>> >amd-gfx mailing list
>> >amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>


[-- Attachment #2: basic_fix.patch --]
[-- Type: text/x-patch, Size: 647 bytes --]

@@ -647,14 +653,28 @@ static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
 {
        unsigned long flags;
 
+
        /*
         * Don't destroy jobs while the timeout worker is running  OR thread
         * is being parked and hence assumed to not touch ring_mirror_list
         */

        if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-           !cancel_delayed_work(&sched->work_tdr)) ||
-           __kthread_should_park(sched->thread)) {
+               return;
+
+       }
+
+       if (work_busy(&sched->work_tdr.work)) {
+               DRM_ERROR("DRM_ERROR work_busy - returning");
                return;
+       }


[-- Attachment #3: simulate_crash.patch --]
[-- Type: text/x-patch, Size: 1473 bytes --]

@@ -647,14 +653,28 @@ static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
 {
        unsigned long flags;
 
+
        /*
         * Don't destroy jobs while the timeout worker is running  OR thread
         * is being parked and hence assumed to not touch ring_mirror_list
         */
+       DRM_ERROR("Sched name %s", sched->name);
+       if (!strcmp("gfx", sched->name)) {
+               spin_lock_irqsave(&sched->job_list_lock, flags);
+               drm_sched_start_timeout(sched);
+               spin_unlock_irqrestore(&sched->job_list_lock, flags);
+       }
        if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-           !cancel_delayed_work(&sched->work_tdr)) ||
-           __kthread_should_park(sched->thread))
+           !cancel_delayed_work(&sched->work_tdr))) {
+               DRM_ERROR("DRM_ERROR cancel_delayed_work false - returning");
+               return;
+
+       }




--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4007,6 +4007,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                device_list_handle = &device_list;
        }
 
+       DRM_ERROR("Waiting for bad job to finish %p", &job->base);
+       dma_fence_wait(&job->base.s_fence->finished, false);
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
                if (tmp_adev != adev) {









[-- Attachment #4: crash.log --]
[-- Type: text/x-log, Size: 5705 bytes --]

[   32.270341 <    0.001165>] #PF: supervisor read access in kernel mode
[   32.270934 <    0.000593>] #PF: error_code(0x0000) - not-present page
[   32.271520 <    0.000586>] PGD 8000000460346067 P4D 8000000460346067 PUD 460345067 PMD 0 
[   32.272108 <    0.000588>] Oops: 0000 [#1] SMP PTI
[   32.272695 <    0.000587>] CPU: 5 PID: 99 Comm: kworker/5:1 Tainted: G           OE     5.3.0-rc3-test-kfd+ #8
[   32.273288 <    0.000593>] Hardware name: System manufacturer System Product Name/Z170-PRO, BIOS 1902 06/27/2016
[   32.273881 <    0.000593>] Workqueue: events drm_sched_job_timedout [gpu_sched]
[   32.274505 <    0.000624>] RIP: 0010:amdgpu_device_gpu_recover+0x319/0xc20 [amdgpu]
[   32.275101 <    0.000596>] Code: 30 c6 44 24 2f 00 48 c7 c2 08 a4 66 c0 48 89 6c 24 30 48 89 6c 24 38 e9 60 fd ff ff 48 8b 44 24 10 48 85 c0 74 56 48 8b 40 10 <48> 8b 98 80 00 00 00 48 85 db 74 46 48 8b 43 30 a8 01 75 1e 48 8b
[   32.276401 <    0.001300>] RSP: 0018:ffffa6940036bd40 EFLAGS: 00010286
[   32.277055 <    0.000654>] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffa69400369960
[   32.277712 <    0.000657>] RDX: 0000000000000000 RSI: ffff8a1b23a620b0 RDI: ffff8a1b23a61800
[   32.278357 <    0.000645>] RBP: ffffa6940036bd70 R08: 0000000000000000 R09: 0000000000000001
[   32.278999 <    0.000642>] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a1b1fa266e8
[   32.279636 <    0.000637>] R13: 0000000000000000 R14: ffff8a1b20140c00 R15: ffff8a1b1fa266e8
[   32.280276 <    0.000640>] FS:  0000000000000000(0000) GS:ffff8a1b26340000(0000) knlGS:0000000000000000
[   32.280925 <    0.000649>] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   32.281567 <    0.000642>] CR2: 0000000000000080 CR3: 0000000460f50006 CR4: 00000000003606e0
[   32.282211 <    0.000644>] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   32.282860 <    0.000649>] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   32.283503 <    0.000643>] Call Trace:
[   32.284182 <    0.000679>]  amdgpu_job_timedout+0x123/0x140 [amdgpu]
[   32.284819 <    0.000637>]  ? drm_sched_job_timedout+0x4d/0xb0 [gpu_sched]
[   32.285451 <    0.000632>]  drm_sched_job_timedout+0x4d/0xb0 [gpu_sched]
[   32.286086 <    0.000635>]  process_one_work+0x1f1/0x600
[   32.286723 <    0.000637>]  worker_thread+0x4c/0x430
[   32.287361 <    0.000638>]  ? process_one_work+0x600/0x600
[   32.287993 <    0.000632>]  kthread+0x101/0x140
[   32.288616 <    0.000623>]  ? kthread_cancel_delayed_work_sync+0x10/0x10
[   32.289250 <    0.000634>]  ret_from_fork+0x24/0x30
[   32.289886 <    0.000636>] Modules linked in: amdgpu(OE) amd_iommu_v2 gpu_sched(OE) ttm(OE) x86_pkg_temp_thermal video acpi_pad
[   32.290549 <    0.000663>] CR2: 0000000000000080
[   32.291212 <    0.000663>] ---[ end trace 8a8ed840f9d52369 ]---
[   32.291902 <    0.000690>] RIP: 0010:amdgpu_device_gpu_recover+0x319/0xc20 [amdgpu]
[   32.292582 <    0.000680>] Code: 30 c6 44 24 2f 00 48 c7 c2 08 a4 66 c0 48 89 6c 24 30 48 89 6c 24 38 e9 60 fd ff ff 48 8b 44 24 10 48 85 c0 74 56 48 8b 40 10 <48> 8b 98 80 00 00 00 48 85 db 74 46 48 8b 43 30 a8 01 75 1e 48 8b
[   32.294026 <    0.001444>] RSP: 0018:ffffa6940036bd40 EFLAGS: 00010286
[   32.294751 <    0.000725>] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffa69400369960
[   32.295478 <    0.000727>] RDX: 0000000000000000 RSI: ffff8a1b23a620b0 RDI: ffff8a1b23a61800
[   32.296200 <    0.000722>] RBP: ffffa6940036bd70 R08: 0000000000000000 R09: 0000000000000001
[   32.296920 <    0.000720>] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a1b1fa266e8
[   32.297634 <    0.000714>] R13: 0000000000000000 R14: ffff8a1b20140c00 R15: ffff8a1b1fa266e8
[   32.298341 <    0.000707>] FS:  0000000000000000(0000) GS:ffff8a1b26340000(0000) knlGS:0000000000000000
[   32.299047 <    0.000706>] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   32.299749 <    0.000702>] CR2: 0000000000000080 CR3: 0000000460f50006 CR4: 00000000003606e0
[   32.300452 <    0.000703>] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   32.301151 <    0.000699>] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   32.301845 <    0.000694>] BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:38
[   32.302543 <    0.000698>] in_atomic(): 0, irqs_disabled(): 1, pid: 99, name: kworker/5:1
[   32.303239 <    0.000696>] INFO: lockdep is turned off.
[   32.303928 <    0.000689>] irq event stamp: 121070
[   32.304610 <    0.000682>] hardirqs last  enabled at (121069): [<ffffffff8409a8f9>] cancel_delayed_work+0x69/0xa0
[   32.305303 <    0.000693>] hardirqs last disabled at (121070): [<ffffffff84001c5a>] trace_hardirqs_off_thunk+0x1a/0x20
[   32.305994 <    0.000691>] softirqs last  enabled at (120778): [<ffffffff8500036a>] __do_softirq+0x36a/0x425
[   32.306678 <    0.000684>] softirqs last disabled at (120771): [<ffffffff8407c943>] irq_exit+0xb3/0xc0
[   32.307358 <    0.000680>] CPU: 5 PID: 99 Comm: kworker/5:1 Tainted: G      D    OE     5.3.0-rc3-test-kfd+ #8
[   32.308044 <    0.000686>] Hardware name: System manufacturer System Product Name/Z170-PRO, BIOS 1902 06/27/2016
[   32.308734 <    0.000690>] Workqueue: events drm_sched_job_timedout [gpu_sched]
[   32.309419 <    0.000685>] Call Trace:
[   32.310097 <    0.000678>]  dump_stack+0x5e/0x8b
[   32.310767 <    0.000670>]  ___might_sleep+0x20c/0x240
[   32.311429 <    0.000662>]  exit_signals+0x30/0x340
[   32.312086 <    0.000657>]  do_exit+0xc0/0xc80
[   32.312710 <    0.000624>]  ? process_one_work+0x600/0x600
[   32.313300 <    0.000590>]  ? kthread+0x101/0x140
[   32.313884 <    0.000584>]  rewind_stack_do_exit+0x17/0x20


[-- Attachment #5: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-12 19:21                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-12 19:21 UTC (permalink / raw)
  To: Christian König, Deng, Emily, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 23003 bytes --]

I was able to reproduce the crash by using the attached
simulate_crash.patch - waiting on the guilty job to signal in the reset
work and artificially rearming the timeout timer just before the check
for !cancel_delayed_work(&sched->work_tdr) in drm_sched_cleanup_jobs -
the crash log is attached as crash.log. I think this confirms the
theory I described earlier in this thread.
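
Spelling the interleaving out (my reading of the current
drm_sched_cleanup_jobs flow, so treat it as a sketch rather than a
trace):

	/*
	 * scheduler thread                     timeout worker (work_tdr)
	 * ----------------                     -------------------------
	 *                                      drm_sched_job_timedout()
	 *                                        amdgpu_job_timedout()
	 *                                          amdgpu_device_gpu_recover()
	 * work_tdr gets re-armed again
	 * (new job, drm_sched_fault, ...)
	 * drm_sched_cleanup_jobs()
	 *   cancel_delayed_work(&sched->work_tdr)
	 *     returns true - it cancels the
	 *     *newly* pending work, not the
	 *     handler that is already running
	 *   frees the signaled jobs, including
	 *   the bad job and its s_fence
	 *                                          dereferences the freed bad
	 *                                          job -> NULL pointer / UAF
	 */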

basic_fix.patch handles this by testing whether another timer is
already armed on this scheduler or a timeout work is executing right
now (see the documentation for work_busy), as sketched below - obviously
this is not a full solution, as it will not protect against races when
there is immediate work scheduling such as in drm_sched_fault - so we
probably need to account for this by making drm_sched_cleanup_jobs (at
least the part where it iterates the ring mirror list and frees jobs)
and GPU reset really mutually exclusive, which they are not today.
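
The shape of that check, roughly (a sketch of the idea, not the exact
hunk from basic_fix.patch; delayed_work_pending()/work_busy() are the
stock workqueue helpers):

	/* Bail out of job destruction if a timeout timer is still armed
	 * on this scheduler or a timeout work is pending/running right
	 * now.  work_busy() is only a snapshot, so this narrows the race
	 * window rather than closing it.
	 */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    (delayed_work_pending(&sched->work_tdr) ||
	     work_busy(&sched->work_tdr.work)))
		return;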

Andrey


On 11/11/19 4:11 PM, Christian König wrote:
> Hi Emily,
>
> you need to print which scheduler instance is freeing the jobs and 
> which one is triggering the reset. The TID and PID are completely 
> meaningless here since we are called from different worker threads and 
> the TID/PID can change on each call.
>
> Apart from that I will look into this a bit deeper when I have time.
>
> Regards,
> Christian.
>
> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>> Hi Christian,
>>     I added the following print in drm_sched_cleanup_jobs. The log
>> shows that using only cancel_delayed_work cannot prevent jobs from
>> being freed while the scheduler is in reset. But I don't know exactly
>> where the driver goes wrong. Do you have any suggestion about this?
>> +       printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n",
>>                current->tgid, current->pid);
>>         /*
>>          * Don't destroy jobs while the timeout worker is running OR thread
>>          * is being parked and hence assumed to not touch ring_mirror_list
>>          */
>>         if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>             !cancel_delayed_work(&sched->work_tdr)))
>>                 return;
>> +       printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>                current->tgid, current->pid);
>> Best wishes
>> Emily Deng
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 
>> timeout, signaled seq=78585, emitted seq=78587
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>> information: process  pid 0 thread  pid 0, 
>> s_job:00000000fe75ab36,tid=15603, pid=15603
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: process  
>> pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: process  
>> pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, pid:2262
>> >-----Original Message-----
>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> >Sent: Tuesday, November 12, 2019 11:28 AM
>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>> >
>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>> >returned true it guarantees that the timeout work is not running. But it
>> >merely means there was a pending timeout work which was removed from the
>> >workqueue before its timer elapsed, so it didn't get a chance to be
>> >dequeued and executed; it doesn't cover already executing work. So there is
>> >a possibility that, while one timeout work is executing, another timeout
>> >work gets enqueued (maybe through earlier job cleanup or through
>> >drm_sched_fault), and if at this point another drm_sched_cleanup_jobs runs,
>> >cancel_delayed_work(&sched->work_tdr) will return true even while a
>> >timeout handler is still in progress.
>> >Unfortunately we cannot change cancel_delayed_work to
>> >cancel_delayed_work_sync to flush the timeout work, as the timeout work
>> >itself waits for the scheduler thread to be parked again.
>> >
>> >Andrey
>> >
>> >________________________________________
>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of
>> >Koenig, Christian <Christian.Koenig@amd.com>
>> >Sent: 08 November 2019 05:35:18
>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>> >
>> >Hi Emily,
>> >
>> >exactly that can't happen. See here:
>> >
>> >>         /* Don't destroy jobs while the timeout worker is running */
>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>> >>            !cancel_delayed_work(&sched->work_tdr))
>> >>                 return NULL;
>> >
>> >We never free jobs while the timeout worker is running to prevent exactly
>> >that issue.
>> >
>> >Regards,
>> >Christian.
>> >
>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>> >> Hi Christian,
>> >>       The drm_sched_job_timedout -> amdgpu_job_timedout path calls
>> >> amdgpu_device_gpu_recover. I mean the main scheduler frees the jobs
>> >> while we are in amdgpu_device_gpu_recover, before calling drm_sched_stop.
>> >>
>> >> Best wishes
>> >> Emily Deng
>> >>
>> >>
>> >>
>> >>> -----Original Message-----
>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> >>> Sent: Friday, November 8, 2019 6:26 PM
>> >>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>> >>>
>> >>> Hi Emily,
>> >>>
>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>> >>>
>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>> >>>
>> >>> Regards,
>> >>> Christian.
>> >>>
>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>> >>>> Hi Christian,
>> >>>>        No, I am on the new branch and it also has that patch. Even if
>> >>>> they are freed by the main scheduler, how can we prevent the main
>> >>>> scheduler from freeing jobs while we are entering
>> >>>> amdgpu_device_gpu_recover?
>> >>>> Best wishes
>> >>>> Emily Deng
>> >>>>
>> >>>>
>> >>>>
>> >>>>> -----Original Message-----
>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> >gfx@lists.freedesktop.org
>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>> >>>>>
>> >>>>> Hi Emily,
>> >>>>>
>> >>>>> in this case you are on an old code branch.
>> >>>>>
>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>> >>>>> timeout handler is running.
>> >>>>>
>> >>>>> See this patch here:
>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>> >>>>>>
>> >>>>>>       drm/scheduler: rework job destruction
>> >>>>> Regards,
>> >>>>> Christian.
>> >>>>>
>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>> >>>>>> Hi Christian,
>> >>>>>>         Please refer to the following log. When it enters the
>> >>>>>> amdgpu_device_gpu_recover function, the bad job 000000005086879e is
>> >>>>>> being freed in amdgpu_job_free_cb at the same time, because the
>> >>>>>> hardware fence has signaled. But amdgpu_device_gpu_recover runs
>> >>>>>> faster; in this case the s_fence is already freed, but the job is
>> >>>>>> not freed yet. Then this issue occurs.
>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>> >>> sdma0
>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>> >>>>> amdgpu
>> >>>>> 0000:00:08.0: GPU reset begin!
>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>> >>>>>> thread pid 0,
>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>> >>>>> Emily:amdgpu_job_free_cb,Process
>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>> >>>>> at
>> >>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>> >>>>> 0000 [#1] SMP PTI
>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>> >>>>>> 449.803488]
>> >>> RIP:
>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>> >>>>>> ff
>> >>>>>> 45 85 e4 0f
>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>> >>>>> <48> 8b
>> >>> 98
>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 ES: 0000
>> >CR0:
>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>> >>>>> 0000000000000400 [  449.811937] Call Trace:
>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>> >>>>>> worker_thread+0x34/0x410 [ 449.814728]  kthread+0x121/0x140 [
>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>> >>>>>>
>> >>>>>>> -----Original Message-----
>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> >>> gfx@lists.freedesktop.org
>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>> >>>>>>> tdr
>> >>>>>>>
>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>> >>>>>>>> Sorry, please take your time.
>> >>>>>>> Have you seen my other response a bit below?
>> >>>>>>>
>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>> >>>>>>> NULL without the job also being freed.
>> >>>>>>>
>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>> >>>>>>>
>> >>>>>>> Regards,
>> >>>>>>> Christian.
>> >>>>>>>
>> >>>>>>>> Best wishes
>> >>>>>>>> Emily Deng
>> >>>>>>>>
>> >>>>>>>>
>> >>>>>>>>
>> >>>>>>>>> -----Original Message-----
>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>> >>>>> gfx@lists.freedesktop.org
>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>> >>>>>>>>> tdr
>> >>>>>>>>>
>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>> >>>>>>>>>> Ping.....
>> >>>>>>>>> You need to give me at least enough time to wake up :)
>> >>>>>>>>>
>> >>>>>>>>>> Best wishes
>> >>>>>>>>>> Emily Deng
>> >>>>>>>>>>
>> >>>>>>>>>>
>> >>>>>>>>>>
>> >>>>>>>>>>> -----Original Message-----
>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>> >>> Behalf
>> >>>>>>>>>>> Of Deng, Emily
>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>> >>>>>>>>>>> gfx@lists.freedesktop.org
>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>> >>>>>>>>>>> for tdr
>> >>>>>>>>>>>
>> >>>>>>>>>>>> -----Original Message-----
>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>> >>>>>>>>>>>> for tdr
>> >>>>>>>>>>>>
>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>> >>>>>>>>>>> case, when we enter amdgpu_device_gpu_recover, the job is
>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at that point it is about
>> >>>>>>>>>>> to be freed.
>> >>>>>>>>>>> But amdgpu_device_gpu_recover is sometimes faster. At that
>> >>>>>>>>>>> time, the job is not freed yet, but s_fence is already NULL.
>> >>>>>>>>> No, that case can't happen. See here:
>> >>>>>>>>>
>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>> >>>>>>>>>>
>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>> >>>>>>>>>>            dma_fence_put(job->fence);
>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>> >>>>>>>>>>            kfree(job);
>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>> >>>>>>>>> to the
>> >>>>> s_fence.
>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>> >>>>>>>>> patch is a clear NAK.
>> >>>>>>>>>
>> >>>>>>>>> Regards,
>> >>>>>>>>> Christian.
>> >>>>>>>>>
>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>> >>>>>>>>>>>> problem is somewhere else.
>> >>>>>>>>>>>>
>> >>>>>>>>>>>> Regards,
>> >>>>>>>>>>>> Christian.
>> >>>>>>>>>>>>
>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>> >>>>>>>>>>>>> ---
>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>> >--
>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>> >>> amdgpu_device_gpu_recover(struct
>> >>>>>>>>>>>> amdgpu_device *adev,
>> >>>>>>>>>>>>>            *
>> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
>> >>>>>>>>>>>>>            */
>> >>>>>>>>>>>>> - if (job && job->base.s_fence->parent &&
>> >>>>>>>>>>>>> + if (job && job->base.s_fence &&
>> >>>>>>>>>>>>> + job->base.s_fence->parent
>> >>>>>>> &&
>> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
>> >>>>>>>>>>>>>                   job_signaled = true;
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>> >drm_sched_increase_karma(struct
>> >>>>>>>>>>>> drm_sched_job
>> >>>>>>>>>>>>> *bad)
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
>> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity,
>> >>>>>>>>>>>>> tmp,
>> >>> &rq-
>> >>>>>>>> entities,
>> >>>>>>>>>>>> list) {
>> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>> >>>>>>> ==
>> >>>>>>>>>>>>> -                              entity->fence_context) {
>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>> >>>>>>>>>>>>> + (bad->s_fence-
>> >>>>>>>>>>>>> scheduled.context ==
>> >>>>>>>>>>>>> +                              entity->fence_context)) {
>> >>>>>>>>>>>>>                                           if
>> >>>>>>>>>>>>> (atomic_read(&bad-
>> >>>>>>>> karma) >
>> >>>>>>>>>>>>>                                               bad->sched-
>> >>>> hang_limit)
>> >>>>>>>>>>>>>                                                   if
>> >>>>>>>>>>>>> (entity-
>> >>>> guilty) @@ -376,7 +376,7 @@ void
>> >>>>>>>>>>>>> drm_sched_stop(struct
>> >>>>>>> drm_gpu_scheduler
>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread
>> >>>>>>>>>>>>> is
>> >>> stopped.
>> >>>>>>>>>>>>>            */
>> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp,
>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job-
>> >>>> s_fence-
>> >>>>>>>> parent,
>> >>>>>>>>>>>>>                                                 &s_job->cb)) {
>> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
>> >>> @@ -
>> >>>>>>> 395,7
>> >>>>>>>>>>> +395,8 @@ void
>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>> >>>>>>>>>>>>>                            *
>> >>>>>>>>>>>>>                            * Job is still alive so fence
>> >>>>>>>>>>>>> refcount at
>> >>> least 1
>> >>>>>>>>>>>>>                            */
>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>> >>>>>>> false);
>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>> >>>>>>>> finished,
>> >>>>>>>>>>>> false);
>> >>>>>>>>>>>>>                           /*
>> >>>>>>>>>>>>>                            * We must keep bad job alive
>> >>>>>>>>>>>>> for later
>> >>> use
>> >>>>>>> during @@
>> >>>>>>>>>>>> -438,7
>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>> >>>>> *sched,
>> >>>>>>>>>>>>> +bool
>> >>>>>>>>>>>> full_recovery)
>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>> >>>>>>>>>>>>>            */
>> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>> >>>>>>>>>>>>> node)
>> >>>>>>>>>>>> {
>> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>> >>>>>>>>>>>>> + s_job-
>> >>>>>>>> s_fence-
>> >>>>>>>>>>>>> parent :
>> >>>>>>>>>>>>> +NULL;
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
>> >>>>>>>>>>>>>
>> >>>>>>>>>>> _______________________________________________
>> >>>>>>>>>>> amd-gfx mailing list
>> >>>>>>>>>>> amd-gfx@lists.freedesktop.org
>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>> >
>> >_______________________________________________
>> >amd-gfx mailing list
>> >amd-gfx@lists.freedesktop.org
>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>


[-- Attachment #2: basic_fix.patch --]
[-- Type: text/x-patch, Size: 647 bytes --]

@@ -647,14 +653,28 @@ static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
 {
        unsigned long flags;
 
+
        /*
         * Don't destroy jobs while the timeout worker is running  OR thread
         * is being parked and hence assumed to not touch ring_mirror_list
         */

        if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-           !cancel_delayed_work(&sched->work_tdr)) ||
-           __kthread_should_park(sched->thread)) {
+               return;
+
+       }
+
+       if (work_busy(&sched->work_tdr.work)) {
+               DRM_ERROR("DRM_ERROR work_busy - returning");
                return;
+       }


[-- Attachment #3: simulate_crash.patch --]
[-- Type: text/x-patch, Size: 1473 bytes --]

@@ -647,14 +653,28 @@ static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
 {
        unsigned long flags;
 
+
        /*
         * Don't destroy jobs while the timeout worker is running  OR thread
         * is being parked and hence assumed to not touch ring_mirror_list
         */
+       DRM_ERROR("Sched name %s", sched->name);
+       if (!strcmp("gfx", sched->name)) {
+               spin_lock_irqsave(&sched->job_list_lock, flags);
+               drm_sched_start_timeout(sched);
+               spin_unlock_irqrestore(&sched->job_list_lock, flags);
+       }
        if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-           !cancel_delayed_work(&sched->work_tdr)) ||
-           __kthread_should_park(sched->thread))
+           !cancel_delayed_work(&sched->work_tdr))) {
+               DRM_ERROR("DRM_ERROR cancel_delayed_work false - returning");
+               return;
+
+       }




--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4007,6 +4007,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                device_list_handle = &device_list;
        }
 
+       DRM_ERROR("Waiting for bad job to finish %p", &job->base);
+       dma_fence_wait(&job->base.s_fence->finished, false);
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
                if (tmp_adev != adev) {









[-- Attachment #4: crash.log --]
[-- Type: text/x-log, Size: 5705 bytes --]

[   32.270341 <    0.001165>] #PF: supervisor read access in kernel mode
[   32.270934 <    0.000593>] #PF: error_code(0x0000) - not-present page
[   32.271520 <    0.000586>] PGD 8000000460346067 P4D 8000000460346067 PUD 460345067 PMD 0 
[   32.272108 <    0.000588>] Oops: 0000 [#1] SMP PTI
[   32.272695 <    0.000587>] CPU: 5 PID: 99 Comm: kworker/5:1 Tainted: G           OE     5.3.0-rc3-test-kfd+ #8
[   32.273288 <    0.000593>] Hardware name: System manufacturer System Product Name/Z170-PRO, BIOS 1902 06/27/2016
[   32.273881 <    0.000593>] Workqueue: events drm_sched_job_timedout [gpu_sched]
[   32.274505 <    0.000624>] RIP: 0010:amdgpu_device_gpu_recover+0x319/0xc20 [amdgpu]
[   32.275101 <    0.000596>] Code: 30 c6 44 24 2f 00 48 c7 c2 08 a4 66 c0 48 89 6c 24 30 48 89 6c 24 38 e9 60 fd ff ff 48 8b 44 24 10 48 85 c0 74 56 48 8b 40 10 <48> 8b 98 80 00 00 00 48 85 db 74 46 48 8b 43 30 a8 01 75 1e 48 8b
[   32.276401 <    0.001300>] RSP: 0018:ffffa6940036bd40 EFLAGS: 00010286
[   32.277055 <    0.000654>] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffa69400369960
[   32.277712 <    0.000657>] RDX: 0000000000000000 RSI: ffff8a1b23a620b0 RDI: ffff8a1b23a61800
[   32.278357 <    0.000645>] RBP: ffffa6940036bd70 R08: 0000000000000000 R09: 0000000000000001
[   32.278999 <    0.000642>] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a1b1fa266e8
[   32.279636 <    0.000637>] R13: 0000000000000000 R14: ffff8a1b20140c00 R15: ffff8a1b1fa266e8
[   32.280276 <    0.000640>] FS:  0000000000000000(0000) GS:ffff8a1b26340000(0000) knlGS:0000000000000000
[   32.280925 <    0.000649>] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   32.281567 <    0.000642>] CR2: 0000000000000080 CR3: 0000000460f50006 CR4: 00000000003606e0
[   32.282211 <    0.000644>] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   32.282860 <    0.000649>] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   32.283503 <    0.000643>] Call Trace:
[   32.284182 <    0.000679>]  amdgpu_job_timedout+0x123/0x140 [amdgpu]
[   32.284819 <    0.000637>]  ? drm_sched_job_timedout+0x4d/0xb0 [gpu_sched]
[   32.285451 <    0.000632>]  drm_sched_job_timedout+0x4d/0xb0 [gpu_sched]
[   32.286086 <    0.000635>]  process_one_work+0x1f1/0x600
[   32.286723 <    0.000637>]  worker_thread+0x4c/0x430
[   32.287361 <    0.000638>]  ? process_one_work+0x600/0x600
[   32.287993 <    0.000632>]  kthread+0x101/0x140
[   32.288616 <    0.000623>]  ? kthread_cancel_delayed_work_sync+0x10/0x10
[   32.289250 <    0.000634>]  ret_from_fork+0x24/0x30
[   32.289886 <    0.000636>] Modules linked in: amdgpu(OE) amd_iommu_v2 gpu_sched(OE) ttm(OE) x86_pkg_temp_thermal video acpi_pad
[   32.290549 <    0.000663>] CR2: 0000000000000080
[   32.291212 <    0.000663>] ---[ end trace 8a8ed840f9d52369 ]---
[   32.291902 <    0.000690>] RIP: 0010:amdgpu_device_gpu_recover+0x319/0xc20 [amdgpu]
[   32.292582 <    0.000680>] Code: 30 c6 44 24 2f 00 48 c7 c2 08 a4 66 c0 48 89 6c 24 30 48 89 6c 24 38 e9 60 fd ff ff 48 8b 44 24 10 48 85 c0 74 56 48 8b 40 10 <48> 8b 98 80 00 00 00 48 85 db 74 46 48 8b 43 30 a8 01 75 1e 48 8b
[   32.294026 <    0.001444>] RSP: 0018:ffffa6940036bd40 EFLAGS: 00010286
[   32.294751 <    0.000725>] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffa69400369960
[   32.295478 <    0.000727>] RDX: 0000000000000000 RSI: ffff8a1b23a620b0 RDI: ffff8a1b23a61800
[   32.296200 <    0.000722>] RBP: ffffa6940036bd70 R08: 0000000000000000 R09: 0000000000000001
[   32.296920 <    0.000720>] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a1b1fa266e8
[   32.297634 <    0.000714>] R13: 0000000000000000 R14: ffff8a1b20140c00 R15: ffff8a1b1fa266e8
[   32.298341 <    0.000707>] FS:  0000000000000000(0000) GS:ffff8a1b26340000(0000) knlGS:0000000000000000
[   32.299047 <    0.000706>] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   32.299749 <    0.000702>] CR2: 0000000000000080 CR3: 0000000460f50006 CR4: 00000000003606e0
[   32.300452 <    0.000703>] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   32.301151 <    0.000699>] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   32.301845 <    0.000694>] BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:38
[   32.302543 <    0.000698>] in_atomic(): 0, irqs_disabled(): 1, pid: 99, name: kworker/5:1
[   32.303239 <    0.000696>] INFO: lockdep is turned off.
[   32.303928 <    0.000689>] irq event stamp: 121070
[   32.304610 <    0.000682>] hardirqs last  enabled at (121069): [<ffffffff8409a8f9>] cancel_delayed_work+0x69/0xa0
[   32.305303 <    0.000693>] hardirqs last disabled at (121070): [<ffffffff84001c5a>] trace_hardirqs_off_thunk+0x1a/0x20
[   32.305994 <    0.000691>] softirqs last  enabled at (120778): [<ffffffff8500036a>] __do_softirq+0x36a/0x425
[   32.306678 <    0.000684>] softirqs last disabled at (120771): [<ffffffff8407c943>] irq_exit+0xb3/0xc0
[   32.307358 <    0.000680>] CPU: 5 PID: 99 Comm: kworker/5:1 Tainted: G      D    OE     5.3.0-rc3-test-kfd+ #8
[   32.308044 <    0.000686>] Hardware name: System manufacturer System Product Name/Z170-PRO, BIOS 1902 06/27/2016
[   32.308734 <    0.000690>] Workqueue: events drm_sched_job_timedout [gpu_sched]
[   32.309419 <    0.000685>] Call Trace:
[   32.310097 <    0.000678>]  dump_stack+0x5e/0x8b
[   32.310767 <    0.000670>]  ___might_sleep+0x20c/0x240
[   32.311429 <    0.000662>]  exit_signals+0x30/0x340
[   32.312086 <    0.000657>]  do_exit+0xc0/0xc80
[   32.312710 <    0.000624>]  ? process_one_work+0x600/0x600
[   32.313300 <    0.000590>]  ? kthread+0x101/0x140
[   32.313884 <    0.000584>]  rewind_stack_do_exit+0x17/0x20


[-- Attachment #5: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-13  7:36                                                                     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-13  7:36 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 24491 bytes --]

The question is where do we rearm the timer for this problem to occur?
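
From my reading of sched_main.c the re-arm candidates are roughly the
ones below - sketched from memory, so please correct me if I am missing
one or misremembering the exact call sites:

	static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
	{
		/* Called from drm_sched_job_begin() on new submissions,
		 * from drm_sched_start() during recovery and from the end
		 * of drm_sched_cleanup_jobs() to queue the timeout for the
		 * next job after the finished ones have been freed.
		 */
		if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
		    !list_empty(&sched->ring_mirror_list))
			schedule_delayed_work(&sched->work_tdr, sched->timeout);
	}

	/* ... plus drm_sched_fault(), which re-arms immediately. */
	void drm_sched_fault(struct drm_gpu_scheduler *sched)
	{
		mod_delayed_work(system_wq, &sched->work_tdr, 0);
	}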

Regards,
Christian.

Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>
> I was able to reproduce the crash by using the attached
> simulate_crash.patch - waiting on the guilty job to signal in the reset
> work and artificially rearming the timeout timer just before the check
> for !cancel_delayed_work(&sched->work_tdr) in drm_sched_cleanup_jobs -
> the crash log is attached as crash.log. I think this confirms the
> theory I described earlier in this thread.
>
> basic_fix.patch handles this by testing whether another timer is
> already armed on this scheduler or a timeout work is executing right
> now (see the documentation for work_busy) - obviously this is not a
> full solution, as it will not protect against races when there is
> immediate work scheduling such as in drm_sched_fault - so we probably
> need to account for this by making drm_sched_cleanup_jobs (at least
> the part where it iterates the ring mirror list and frees jobs) and
> GPU reset really mutually exclusive, which they are not today.
>
> Andrey
>
>
> On 11/11/19 4:11 PM, Christian König wrote:
>> Hi Emily,
>>
>> you need to print which scheduler instance is freeing the jobs and 
>> which one is triggering the reset. The TID and PID are completely 
>> meaningless here since we are called from different worker threads 
>> and the TID/PID can change on each call.
>>
>> Apart from that I will look into this a bit deeper when I have time.
>>
>> Regards,
>> Christian.
>>
>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>> Hi Christian,
>>>     I added the following print in drm_sched_cleanup_jobs. The log
>>> shows that using only cancel_delayed_work cannot prevent jobs from
>>> being freed while the scheduler is in reset. But I don't know exactly
>>> where the driver goes wrong. Do you have any suggestion about this?
>>> +       printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n",
>>>                current->tgid, current->pid);
>>>         /*
>>>          * Don't destroy jobs while the timeout worker is running OR thread
>>>          * is being parked and hence assumed to not touch ring_mirror_list
>>>          */
>>>         if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>             !cancel_delayed_work(&sched->work_tdr)))
>>>                 return;
>>> +       printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>                current->tgid, current->pid);
>>> Best wishes
>>> Emily Deng
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 
>>> timeout, signaled seq=78585, emitted seq=78587
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>> information: process  pid 0 thread  pid 0, 
>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, pid:2262
>>> >-----Original Message-----
>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>> >To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; Deng, Emily
>>> ><Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>> >
>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>> >returned true it guarantees that the timeout work is not running. But it
>>> >merely means there was a pending timeout work which was removed from the
>>> >workqueue before its timer elapsed, so it didn't get a chance to be
>>> >dequeued and executed; it doesn't cover already executing work. So there is
>>> >a possibility that, while one timeout work is executing, another timeout
>>> >work gets enqueued (maybe through earlier job cleanup or through
>>> >drm_sched_fault), and if at this point another drm_sched_cleanup_jobs runs,
>>> >cancel_delayed_work(&sched->work_tdr) will return true even while a
>>> >timeout handler is still in progress.
>>> >Unfortunately we cannot change cancel_delayed_work to
>>> >cancel_delayed_work_sync to flush the timeout work, as the timeout work
>>> >itself waits for the scheduler thread to be parked again.
>>> >
>>> >Andrey
>>> >
>>> >________________________________________
>>> >From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> on behalf of
>>> >Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>> >Sent: 08 November 2019 05:35:18
>>> >To: Deng, Emily; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>> >
>>> >Hi Emily,
>>> >
>>> >exactly that can't happen. See here:
>>> >
>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>> >>                 return NULL;
>>> >
>>> >We never free jobs while the timeout working is running to prevent exactly
>>> >that issue.
>>> >
>>> >Regards,
>>> >Christian.
>>> >
>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>> >> Hi Christian,
>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>> >>
>>> >> Best wishes
>>> >> Emily Deng
>>> >>
>>> >>
>>> >>
>>> >>> -----Original Message-----
>>> >>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>> >>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>> >>>
>>> >>> Hi Emily,
>>> >>>
>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>> >>>
>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>> >>>
>>> >>> Regards,
>>> >>> Christian.
>>> >>>
>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>> >>>> Hi Chrisitan,
>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>> >>>> are freed by
>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>> >>> enter to function amdgpu_device_gpu_recover?
>>> >>>> Best wishes
>>> >>>> Emily Deng
>>> >>>>
>>> >>>>
>>> >>>>
>>> >>>>> -----Original Message-----
>>> >>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>> >>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>> >gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>> >>>>>
>>> >>>>> Hi Emily,
>>> >>>>>
>>> >>>>> in this case you are on an old code branch.
>>> >>>>>
>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>> >>>>> timeout handler is running.
>>> >>>>>
>>> >>>>> See this patch here:
>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>> >>>>>> Author: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org>
>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>> >>>>>>
>>> >>>>>>       drm/scheduler: rework job destruction
>>> >>>>> Regards,
>>> >>>>> Christian.
>>> >>>>>
>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>> >>>>>> Hi Christian,
>>> >>>>>>         Please refer to follow log, when it enter to
>>> >>>>>> amdgpu_device_gpu_recover
>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>> >>> signal.
>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>> >occurs.
>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>> >>> sdma0
>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>> >>>>> amdgpu
>>> >>>>> 0000:00:08.0: GPU reset begin!
>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>> >>>>>> thread pid 0,
>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>> >>>>> at
>>> >>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>> >>>>> 0000 [#1] SMP PTI
>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>> >>>>>> 449.803488]
>>> >>> RIP:
>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>> >>>>>> ff
>>> >>>>>> 45 85 e4 0f
>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>> >>>>> <48> 8b
>>> >>> 98
>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 ES: 0000
>>> >CR0:
>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>> >>>>> 0000000000000400 [  449.811937] Call Trace:
>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>> >>>>>> worker_thread+0x34/0x410 [ 449.814728]  kthread+0x121/0x140 [
>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>> >>>>>>
>>> >>>>>>> -----Original Message-----
>>> >>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>> >>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>> >>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>> >>>>>>> tdr
>>> >>>>>>>
>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>> >>>>>>>> Sorry, please take your time.
>>> >>>>>>> Have you seen my other response a bit below?
>>> >>>>>>>
>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>> >>>>>>> NULL without the job also being freed.
>>> >>>>>>>
>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>> >>>>>>>
>>> >>>>>>> Regards,
>>> >>>>>>> Christian.
>>> >>>>>>>
>>> >>>>>>>> Best wishes
>>> >>>>>>>> Emily Deng
>>> >>>>>>>>
>>> >>>>>>>>
>>> >>>>>>>>
>>> >>>>>>>>> -----Original Message-----
>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>> >>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>> >>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>> >>>>>>>>> tdr
>>> >>>>>>>>>
>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>> >>>>>>>>>> Ping.....
>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>> >>>>>>>>>
>>> >>>>>>>>>> Best wishes
>>> >>>>>>>>>> Emily Deng
>>> >>>>>>>>>>
>>> >>>>>>>>>>
>>> >>>>>>>>>>
>>> >>>>>>>>>>> -----Original Message-----
>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> On
>>> >>> Behalf
>>> >>>>>>>>>>> Of Deng, Emily
>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; amd-
>>> >>>>>>>>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>> >>>>>>>>>>> for tdr
>>> >>>>>>>>>>>
>>> >>>>>>>>>>>> -----Original Message-----
>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>;
>>> >>>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>> >>>>>>>>>>>> for tdr
>>> >>>>>>>>>>>>
>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>> >>>>>>>>>>> go to free
>>> >>>>> job.
>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>> >>>>>>>>> No, that case can't happen. See here:
>>> >>>>>>>>>
>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>> >>>>>>>>>>
>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>> >>>>>>>>>>            kfree(job);
>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>> >>>>>>>>> to the
>>> >>>>> s_fence.
>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>> >>>>>>>>> patch is a clear NAK.
>>> >>>>>>>>>
>>> >>>>>>>>> Regards,
>>> >>>>>>>>> Christian.
>>> >>>>>>>>>
>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>> >>>>>>>>>>>> problem is somewhere else.
>>> >>>>>>>>>>>>
>>> >>>>>>>>>>>> Regards,
>>> >>>>>>>>>>>> Christian.
>>> >>>>>>>>>>>>
>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng-5C7GfCeVMHo@public.gmane.org>
>>> >>>>>>>>>>>>> ---
>>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>>> >--
>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>> >>> amdgpu_device_gpu_recover(struct
>>> >>>>>>>>>>>> amdgpu_device *adev,
>>> >>>>>>>>>>>>>            *
>>> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
>>> >>>>>>>>>>>>>            */
>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>> >>>>>>> &&
>>> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
>>> >>>>>>>>>>>>>                   job_signaled = true;
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>> >drm_sched_increase_karma(struct
>>> >>>>>>>>>>>> drm_sched_job
>>> >>>>>>>>>>>>> *bad)
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
>>> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity,
>>> >>>>>>>>>>>>> tmp,
>>> >>> &rq-
>>> >>>>>>>> entities,
>>> >>>>>>>>>>>> list) {
>>> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>> >>>>>>> ==
>>> >>>>>>>>>>>>> -                              entity->fence_context) {
>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>> >>>>>>>>>>>>> + (bad->s_fence-
>>> >>>>>>>>>>>>> scheduled.context ==
>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>> >>>>>>>>>>>>>                                           if
>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>> >>>>>>>> karma) >
>>> >>>>>>>>>>>>>                                               bad->sched-
>>> >>>> hang_limit)
>>> >>>>>>>>>>>>>                                                   if
>>> >>>>>>>>>>>>> (entity-
>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>> >>>>>>> drm_gpu_scheduler
>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread
>>> >>>>>>>>>>>>> is
>>> >>> stopped.
>>> >>>>>>>>>>>>>            */
>>> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp,
>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job-
>>> >>>> s_fence-
>>> >>>>>>>> parent,
>>> >>>>>>>>>>>>>                                                 &s_job->cb)) {
>>> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
>>> >>> @@ -
>>> >>>>>>> 395,7
>>> >>>>>>>>>>> +395,8 @@ void
>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>> >>>>>>>>>>>>>                            *
>>> >>>>>>>>>>>>>                            * Job is still alive so fence
>>> >>>>>>>>>>>>> refcount at
>>> >>> least 1
>>> >>>>>>>>>>>>>                            */
>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>> >>>>>>> false);
>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>> >>>>>>>> finished,
>>> >>>>>>>>>>>> false);
>>> >>>>>>>>>>>>>                           /*
>>> >>>>>>>>>>>>>                            * We must keep bad job alive
>>> >>>>>>>>>>>>> for later
>>> >>> use
>>> >>>>>>> during @@
>>> >>>>>>>>>>>> -438,7
>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>> >>>>> *sched,
>>> >>>>>>>>>>>>> +bool
>>> >>>>>>>>>>>> full_recovery)
>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>> >>>>>>>>>>>>>            */
>>> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
>>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>>> >>>>>>>>>>>>> node)
>>> >>>>>>>>>>>> {
>>> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>>> >>>>>>>>>>>>> + s_job-
>>> >>>>>>>> s_fence-
>>> >>>>>>>>>>>>> parent :
>>> >>>>>>>>>>>>> +NULL;
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>> _______________________________________________
>>> >>>>>>>>>>> amd-gfx mailing list
>>> >>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>> >
>>> >_______________________________________________
>>> >amd-gfx mailing list
>>> >amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-13  7:36                                                                     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-13  7:36 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Deng, Emily, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 23761 bytes --]

The question is where do we rearm the timer for this problem to occur?

Regards,
Christian.
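
For context on the race discussed in the quoted mail below:
cancel_delayed_work() only dequeues a delayed work that has not started
executing yet (and returns true in that case); it neither waits for nor
detects a handler that is already running. A minimal sketch of the window
being described - a reading aid only, not code from any of the attached
patches:

	/* drm_sched_cleanup_jobs(), simplified */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !cancel_delayed_work(&sched->work_tdr))
		return;
	/*
	 * Getting here only proves that a *queued* work_tdr instance was
	 * removed. If drm_sched_job_timedout() is already executing and the
	 * timeout was re-armed in the meantime (for example by an earlier
	 * cleanup pass or by drm_sched_fault()), the instance just cancelled
	 * is a second one, and the job freeing below runs concurrently with
	 * the reset handler.
	 */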

Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>
> I was able to reproduce the crash by using the attached 
> simulate_crash.patch - waiting on the guilty job to signal in the reset 
> work and artificially rearming the timeout timer just before the check 
> for !cancel_delayed_work(&sched->work_tdr) in drm_sched_cleanup_jobs - 
> crash log attached in crash.log. This, I think, confirms the theory I 
> described earlier in this thread.
>
> basic_fix.patch handles this by testing whether another timer is already 
> armed on this scheduler or a timeout work is executing right now (see the 
> documentation for work_busy) - obviously this is not a full solution, as 
> it will not protect against races if, for example, there is immediate 
> work scheduling such as in drm_sched_fault - so we probably need to 
> account for this by making drm_sched_cleanup_jobs (at least the part 
> where it iterates the ring mirror list and frees jobs) and GPU reset 
> really mutually exclusive, which they are not today.
>
> Andrey
>
>
> On 11/11/19 4:11 PM, Christian König wrote:
>> Hi Emily,
>>
>> you need to print which scheduler instance is freeing the jobs and 
>> which one is triggering the reset. The TID and PID is completely 
>> meaningless here since we are called from different worker threads 
>> and the TID/PID can change on each call.
>>
>> Apart from that I will look into this a bit deeper when I have time.
>>
>> Regards,
>> Christian.
>>
>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>> Hi Christian,
>>>     I added the following print in function drm_sched_cleanup_jobs. From 
>>> the log it shows that using only cancel_delayed_work could not avoid 
>>> freeing a job while the sched is in reset. But I don’t know exactly where 
>>> the driver goes wrong. Do you have any suggestion about this?
>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", 
>>> current->tgid, current->pid);
>>>         /*
>>>          * Don't destroy jobs while the timeout worker is running  
>>> OR thread
>>>          * is being parked and hence assumed to not touch 
>>> ring_mirror_list
>>>          */
>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>> !cancel_delayed_work(&sched->work_tdr)))
>>>                 return;
>>> +       printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>> current->tgid, current->pid);
>>> Best wishes
>>> Emily Deng
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 
>>> timeout, signaled seq=78585, emitted seq=78587
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>> information: process  pid 0 thread  pid 0, 
>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, pid:2262
>>> >-----Original Message-----
>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>> >
>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>> >returned true it guarantees that timeout work is not running but, it merely
>>> >means there was a pending timeout work which was removed from the
>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>> >possibility where while timeout work started executing another timeout work
>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>> >timeout job in progress.
>>> >Unfortunately we cannot change cancel_delayed_work to
>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>> >waits for schedule thread  to be parked again when calling park_thread.
>>> >
>>> >Andrey
>>> >
>>> >________________________________________
>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of
>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>> >Sent: 08 November 2019 05:35:18
>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>> >
>>> >Hi Emily,
>>> >
>>> >exactly that can't happen. See here:
>>> >
>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>> >>                 return NULL;
>>> >
>>> >We never free jobs while the timeout working is running to prevent exactly
>>> >that issue.
>>> >
>>> >Regards,
>>> >Christian.
>>> >
>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>> >> Hi Christian,
>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>> >>
>>> >> Best wishes
>>> >> Emily Deng
>>> >>
>>> >>
>>> >>
>>> >>> -----Original Message-----
>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>> >>>
>>> >>> Hi Emily,
>>> >>>
>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>> >>>
>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>> >>>
>>> >>> Regards,
>>> >>> Christian.
>>> >>>
>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>> >>>> Hi Chrisitan,
>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>> >>>> are freed by
>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>> >>> enter to function amdgpu_device_gpu_recover?
>>> >>>> Best wishes
>>> >>>> Emily Deng
>>> >>>>
>>> >>>>
>>> >>>>
>>> >>>>> -----Original Message-----
>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> >gfx@lists.freedesktop.org
>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>> >>>>>
>>> >>>>> Hi Emily,
>>> >>>>>
>>> >>>>> in this case you are on an old code branch.
>>> >>>>>
>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>> >>>>> timeout handler is running.
>>> >>>>>
>>> >>>>> See this patch here:
>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>> >>>>>>
>>> >>>>>>       drm/scheduler: rework job destruction
>>> >>>>> Regards,
>>> >>>>> Christian.
>>> >>>>>
>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>> >>>>>> Hi Christian,
>>> >>>>>>         Please refer to follow log, when it enter to
>>> >>>>>> amdgpu_device_gpu_recover
>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>> >>> signal.
>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>> >occurs.
>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>> >>> sdma0
>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>> >>>>> amdgpu
>>> >>>>> 0000:00:08.0: GPU reset begin!
>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>> >>>>>> thread pid 0,
>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>> >>>>> at
>>> >>>>> 00000000000000c0 [  449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>> >>>>> 0000 [#1] SMP PTI
>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>> >>>>>> 449.803488]
>>> >>> RIP:
>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>> >>>>>> ff
>>> >>>>>> 45 85 e4 0f
>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>> >>>>> <48> 8b
>>> >>> 98
>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 ES: 0000
>>> >CR0:
>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>> >>>>> 0000000000000400 [  449.811937] Call Trace:
>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>> >>>>>> worker_thread+0x34/0x410 [ 449.814728]  kthread+0x121/0x140 [
>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>> >>>>>>
>>> >>>>>>> -----Original Message-----
>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> >>> gfx@lists.freedesktop.org
>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>> >>>>>>> tdr
>>> >>>>>>>
>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>> >>>>>>>> Sorry, please take your time.
>>> >>>>>>> Have you seen my other response a bit below?
>>> >>>>>>>
>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>> >>>>>>> NULL without the job also being freed.
>>> >>>>>>>
>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>> >>>>>>>
>>> >>>>>>> Regards,
>>> >>>>>>> Christian.
>>> >>>>>>>
>>> >>>>>>>> Best wishes
>>> >>>>>>>> Emily Deng
>>> >>>>>>>>
>>> >>>>>>>>
>>> >>>>>>>>
>>> >>>>>>>>> -----Original Message-----
>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>> >>>>> gfx@lists.freedesktop.org
>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>> >>>>>>>>> tdr
>>> >>>>>>>>>
>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>> >>>>>>>>>> Ping.....
>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>> >>>>>>>>>
>>> >>>>>>>>>> Best wishes
>>> >>>>>>>>>> Emily Deng
>>> >>>>>>>>>>
>>> >>>>>>>>>>
>>> >>>>>>>>>>
>>> >>>>>>>>>>> -----Original Message-----
>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>> >>> Behalf
>>> >>>>>>>>>>> Of Deng, Emily
>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>> >>>>>>>>>>> for tdr
>>> >>>>>>>>>>>
>>> >>>>>>>>>>>> -----Original Message-----
>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>> >>>>>>>>>>>> for tdr
>>> >>>>>>>>>>>>
>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>> >>>>>>>>>>> go to free
>>> >>>>> job.
>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>> >>>>>>>>> No, that case can't happen. See here:
>>> >>>>>>>>>
>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>> >>>>>>>>>>
>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>> >>>>>>>>>>            kfree(job);
>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>> >>>>>>>>> to the
>>> >>>>> s_fence.
>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>> >>>>>>>>> patch is a clear NAK.
>>> >>>>>>>>>
>>> >>>>>>>>> Regards,
>>> >>>>>>>>> Christian.
>>> >>>>>>>>>
>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>> >>>>>>>>>>>> problem is somewhere else.
>>> >>>>>>>>>>>>
>>> >>>>>>>>>>>> Regards,
>>> >>>>>>>>>>>> Christian.
>>> >>>>>>>>>>>>
>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>> >>>>>>>>>>>>> ---
>>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>>> >--
>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>> >>> amdgpu_device_gpu_recover(struct
>>> >>>>>>>>>>>> amdgpu_device *adev,
>>> >>>>>>>>>>>>>            *
>>> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
>>> >>>>>>>>>>>>>            */
>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>> >>>>>>> &&
>>> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
>>> >>>>>>>>>>>>>                   job_signaled = true;
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>> >drm_sched_increase_karma(struct
>>> >>>>>>>>>>>> drm_sched_job
>>> >>>>>>>>>>>>> *bad)
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
>>> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity,
>>> >>>>>>>>>>>>> tmp,
>>> >>> &rq-
>>> >>>>>>>> entities,
>>> >>>>>>>>>>>> list) {
>>> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>> >>>>>>> ==
>>> >>>>>>>>>>>>> -                              entity->fence_context) {
>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>> >>>>>>>>>>>>> + (bad->s_fence-
>>> >>>>>>>>>>>>> scheduled.context ==
>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>> >>>>>>>>>>>>>                                           if
>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>> >>>>>>>> karma) >
>>> >>>>>>>>>>>>>                                               bad->sched-
>>> >>>> hang_limit)
>>> >>>>>>>>>>>>>                                                   if
>>> >>>>>>>>>>>>> (entity-
>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>> >>>>>>> drm_gpu_scheduler
>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread
>>> >>>>>>>>>>>>> is
>>> >>> stopped.
>>> >>>>>>>>>>>>>            */
>>> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp,
>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job-
>>> >>>> s_fence-
>>> >>>>>>>> parent,
>>> >>>>>>>>>>>>>                                                 &s_job->cb)) {
>>> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
>>> >>> @@ -
>>> >>>>>>> 395,7
>>> >>>>>>>>>>> +395,8 @@ void
>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>> >>>>>>>>>>>>>                            *
>>> >>>>>>>>>>>>>                            * Job is still alive so fence
>>> >>>>>>>>>>>>> refcount at
>>> >>> least 1
>>> >>>>>>>>>>>>>                            */
>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>> >>>>>>> false);
>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>> >>>>>>>> finished,
>>> >>>>>>>>>>>> false);
>>> >>>>>>>>>>>>>                           /*
>>> >>>>>>>>>>>>>                            * We must keep bad job alive
>>> >>>>>>>>>>>>> for later
>>> >>> use
>>> >>>>>>> during @@
>>> >>>>>>>>>>>> -438,7
>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>> >>>>> *sched,
>>> >>>>>>>>>>>>> +bool
>>> >>>>>>>>>>>> full_recovery)
>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>> >>>>>>>>>>>>>            */
>>> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
>>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>>> >>>>>>>>>>>>> node)
>>> >>>>>>>>>>>> {
>>> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>>> >>>>>>>>>>>>> + s_job-
>>> >>>>>>>> s_fence-
>>> >>>>>>>>>>>>> parent :
>>> >>>>>>>>>>>>> +NULL;
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
>>> >>>>>>>>>>>>>
>>> >>>>>>>>>>> _______________________________________________
>>> >>>>>>>>>>> amd-gfx mailing list
>>> >>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>> >
>>> >_______________________________________________
>>> >amd-gfx mailing list
>>> >amd-gfx@lists.freedesktop.org
>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-13 14:12                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-13 14:12 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 25213 bytes --]

This is why I asked for a trace with the timer enabled, but since there is 
a finite number of places where we touch the timer, Emily can just put 
prints there. Also, I wonder whether this temp fix helps her with the issue 
or not.

Andrey
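
For reference, the temp fix mentioned above (basic_fix.patch, described
further down in the quoted mail) boils down to refusing to free jobs while
the timeout machinery is armed or its handler is running. A rough sketch of
that shape - an assumption based on the description, not the actual
attachment:

	/*
	 * In drm_sched_cleanup_jobs(), before touching the ring mirror list:
	 * skip cleanup while a timeout work is still queued or its handler
	 * is currently executing on this scheduler.
	 */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    (timer_pending(&sched->work_tdr.timer) ||
	     work_busy(&sched->work_tdr.work)))
		return;

As noted in the quoted discussion, this is still racy against paths that
schedule the timeout work immediately (such as drm_sched_fault()), so it
can only be a stop-gap until job cleanup and GPU reset are made properly
mutually exclusive.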

On 11/13/19 2:36 AM, Christian König wrote:
> The question is where do we rearm the timer for this problem to occur?
>
> Regards,
> Christian.
>
> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>
>> I was able to reproduce the crash by using the attached 
>> simulate_crash.patch - waiting on the guilty job to signal in the reset 
>> work and artificially rearming the timeout timer just before the check 
>> for !cancel_delayed_work(&sched->work_tdr) in drm_sched_cleanup_jobs - 
>> crash log attached in crash.log. This, I think, confirms the theory I 
>> described earlier in this thread.
>>
>> basic_fix.patch handles this by testing whether another timer is already 
>> armed on this scheduler or a timeout work is executing right now (see 
>> the documentation for work_busy) - obviously this is not a full 
>> solution, as it will not protect against races if, for example, there is 
>> immediate work scheduling such as in drm_sched_fault - so we probably 
>> need to account for this by making drm_sched_cleanup_jobs (at least the 
>> part where it iterates the ring mirror list and frees jobs) and GPU 
>> reset really mutually exclusive, which they are not today.
>>
>> Andrey
>>
>>
>> On 11/11/19 4:11 PM, Christian König wrote:
>>> Hi Emily,
>>>
>>> you need to print which scheduler instance is freeing the jobs and 
>>> which one is triggering the reset. The TID and PID is completely 
>>> meaningless here since we are called from different worker threads 
>>> and the TID/PID can change on each call.
>>>
>>> Apart from that I will look into this a bit deeper when I have time.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>> Hi Christian,
>>>>     I added the following print in function drm_sched_cleanup_jobs. From 
>>>> the log it shows that using only cancel_delayed_work could not avoid 
>>>> freeing a job while the sched is in reset. But I don’t know exactly 
>>>> where the driver goes wrong. Do you have any suggestion 
>>>> about this?
>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", 
>>>> current->tgid, current->pid);
>>>>         /*
>>>>          * Don't destroy jobs while the timeout worker is running  
>>>> OR thread
>>>>          * is being parked and hence assumed to not touch 
>>>> ring_mirror_list
>>>>          */
>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>                 return;
>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>> current->tgid, current->pid);
>>>> Best wishes
>>>> Emily Deng
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>> information: process  pid 0 thread pid 0, 
>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>> pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>> pid:2262
>>>> >-----Original Message-----
>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>> >To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; Deng, Emily
>>>> ><Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>> >
>>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>>> >returned true it guarantees that timeout work is not running but, it merely
>>>> >means there was a pending timeout work which was removed from the
>>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>>> >possibility where while timeout work started executing another timeout work
>>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>>> >timeout job in progress.
>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>>> >waits for schedule thread  to be parked again when calling park_thread.
>>>> >
>>>> >Andrey
>>>> >
>>>> >________________________________________
>>>> >From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> on behalf of
>>>> >Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>> >Sent: 08 November 2019 05:35:18
>>>> >To: Deng, Emily; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>> >
>>>> >Hi Emily,
>>>> >
>>>> >exactly that can't happen. See here:
>>>> >
>>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>>> >>                 return NULL;
>>>> >
>>>> >We never free jobs while the timeout working is running to prevent exactly
>>>> >that issue.
>>>> >
>>>> >Regards,
>>>> >Christian.
>>>> >
>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>> >> Hi Christian,
>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>> >>
>>>> >> Best wishes
>>>> >> Emily Deng
>>>> >>
>>>> >>
>>>> >>
>>>> >>> -----Original Message-----
>>>> >>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>> >>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>> >>>
>>>> >>> Hi Emily,
>>>> >>>
>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>> >>>
>>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>> >>>
>>>> >>> Regards,
>>>> >>> Christian.
>>>> >>>
>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>> >>>> Hi Chrisitan,
>>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>>> >>>> are freed by
>>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>> >>>> Best wishes
>>>> >>>> Emily Deng
>>>> >>>>
>>>> >>>>
>>>> >>>>
>>>> >>>>> -----Original Message-----
>>>> >>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>> >>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>> >gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>> >>>>>
>>>> >>>>> Hi Emily,
>>>> >>>>>
>>>> >>>>> in this case you are on an old code branch.
>>>> >>>>>
>>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>>> >>>>> timeout handler is running.
>>>> >>>>>
>>>> >>>>> See this patch here:
>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>> >>>>>> Author: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org>
>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>> >>>>>>
>>>> >>>>>>       drm/scheduler: rework job destruction
>>>> >>>>> Regards,
>>>> >>>>> Christian.
>>>> >>>>>
>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>> >>>>>> Hi Christian,
>>>> >>>>>>         Please refer to follow log, when it enter to
>>>> >>>>>> amdgpu_device_gpu_recover
>>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>> >>> signal.
>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>>> >occurs.
>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>> >>> sdma0
>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>> >>>>> amdgpu
>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>>> >>>>>> thread pid 0,
>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>> >>>>> at
>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>> >>>>> 0000 [#1] SMP PTI
>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>> >>>>>> 449.803488]
>>>> >>> RIP:
>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>> >>>>>> ff
>>>> >>>>>> 45 85 e4 0f
>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>> >>>>> <48> 8b
>>>> >>> 98
>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 ES: 0000
>>>> >CR0:
>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>> >>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>> >>>>>>
>>>> >>>>>>> -----Original Message-----
>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>> >>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>> >>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>> >>>>>>> tdr
>>>> >>>>>>>
>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>> >>>>>>>> Sorry, please take your time.
>>>> >>>>>>> Have you seen my other response a bit below?
>>>> >>>>>>>
>>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>> >>>>>>> NULL without the job also being freed.
>>>> >>>>>>>
>>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>> >>>>>>>
>>>> >>>>>>> Regards,
>>>> >>>>>>> Christian.
>>>> >>>>>>>
>>>> >>>>>>>> Best wishes
>>>> >>>>>>>> Emily Deng
>>>> >>>>>>>>
>>>> >>>>>>>>
>>>> >>>>>>>>
>>>> >>>>>>>>> -----Original Message-----
>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>> >>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>> >>>>>>>>> tdr
>>>> >>>>>>>>>
>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>> >>>>>>>>>> Ping.....
>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>> >>>>>>>>>
>>>> >>>>>>>>>> Best wishes
>>>> >>>>>>>>>> Emily Deng
>>>> >>>>>>>>>>
>>>> >>>>>>>>>>
>>>> >>>>>>>>>>
>>>> >>>>>>>>>>> -----Original Message-----
>>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> On
>>>> >>> Behalf
>>>> >>>>>>>>>>> Of Deng, Emily
>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; amd-
>>>> >>>>>>>>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>> >>>>>>>>>>> for tdr
>>>> >>>>>>>>>>>
>>>> >>>>>>>>>>>> -----Original Message-----
>>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>;
>>>> >>>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>> >>>>>>>>>>>> for tdr
>>>> >>>>>>>>>>>>
>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>> >>>>>>>>>>> go to free
>>>> >>>>> job.
>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>> >>>>>>>>> No, that case can't happen. See here:
>>>> >>>>>>>>>
>>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>> >>>>>>>>>>
>>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>> >>>>>>>>>>            kfree(job);
>>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>>> >>>>>>>>> to the
>>>> >>>>> s_fence.
>>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>>> >>>>>>>>> patch is a clear NAK.
>>>> >>>>>>>>>
>>>> >>>>>>>>> Regards,
>>>> >>>>>>>>> Christian.
>>>> >>>>>>>>>
>>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>> >>>>>>>>>>>> problem is somewhere else.
>>>> >>>>>>>>>>>>
>>>> >>>>>>>>>>>> Regards,
>>>> >>>>>>>>>>>> Christian.
>>>> >>>>>>>>>>>>
>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng-5C7GfCeVMHo@public.gmane.org>
>>>> >>>>>>>>>>>>> ---
>>>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>>>> >--
>>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>> >>> amdgpu_device_gpu_recover(struct
>>>> >>>>>>>>>>>> amdgpu_device *adev,
>>>> >>>>>>>>>>>>>            *
>>>> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
>>>> >>>>>>>>>>>>>            */
>>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>>> >>>>>>> &&
>>>> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
>>>> >>>>>>>>>>>>>                   job_signaled = true;
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>> >drm_sched_increase_karma(struct
>>>> >>>>>>>>>>>> drm_sched_job
>>>> >>>>>>>>>>>>> *bad)
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
>>>> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity,
>>>> >>>>>>>>>>>>> tmp,
>>>> >>> &rq-
>>>> >>>>>>>> entities,
>>>> >>>>>>>>>>>> list) {
>>>> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>>> >>>>>>> ==
>>>> >>>>>>>>>>>>> - entity->fence_context) {
>>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>> >>>>>>>>>>>>> + (bad->s_fence-
>>>> >>>>>>>>>>>>> scheduled.context ==
>>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>>> >>>>>>>>>>>>>                                           if
>>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>>> >>>>>>>> karma) >
>>>> >>>>>>>>>>>>>                                               bad->sched-
>>>> >>>> hang_limit)
>>>> >>>>>>>>>>>>>                                                   if
>>>> >>>>>>>>>>>>> (entity-
>>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>>> >>>>>>> drm_gpu_scheduler
>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread
>>>> >>>>>>>>>>>>> is
>>>> >>> stopped.
>>>> >>>>>>>>>>>>>            */
>>>> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp,
>>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>>> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job-
>>>> >>>> s_fence-
>>>> >>>>>>>> parent,
>>>> >>>>>>>>>>>>>                                                 &s_job->cb)) {
>>>> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
>>>> >>> @@ -
>>>> >>>>>>> 395,7
>>>> >>>>>>>>>>> +395,8 @@ void
>>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>> >>>>>>>>>>>>>                            *
>>>> >>>>>>>>>>>>>                            * Job is still alive so fence
>>>> >>>>>>>>>>>>> refcount at
>>>> >>> least 1
>>>> >>>>>>>>>>>>>                            */
>>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>> >>>>>>> false);
>>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>> >>>>>>>> finished,
>>>> >>>>>>>>>>>> false);
>>>> >>>>>>>>>>>>>                           /*
>>>> >>>>>>>>>>>>>                            * We must keep bad job alive
>>>> >>>>>>>>>>>>> for later
>>>> >>> use
>>>> >>>>>>> during @@
>>>> >>>>>>>>>>>> -438,7
>>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>> >>>>> *sched,
>>>> >>>>>>>>>>>>> +bool
>>>> >>>>>>>>>>>> full_recovery)
>>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>>> >>>>>>>>>>>>>            */
>>>> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
>>>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>>>> >>>>>>>>>>>>> node)
>>>> >>>>>>>>>>>> {
>>>> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>>> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>>>> >>>>>>>>>>>>> + s_job-
>>>> >>>>>>>> s_fence-
>>>> >>>>>>>>>>>>> parent :
>>>> >>>>>>>>>>>>> +NULL;
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>> _______________________________________________
>>>> >>>>>>>>>>> amd-gfx mailing list
>>>> >>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>> >
>>>> >_______________________________________________
>>>> >amd-gfx mailing list
>>>> >amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-13 14:12                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-13 14:12 UTC (permalink / raw)
  To: christian.koenig, Deng, Emily, amd-gfx


This is why I asked for a trace with the timer enabled, but since there is a
finite number of places where we touch the timer, Emily can just put prints
there. Also, I wonder whether this temp fix helps her with the issue or not.
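
Just as an illustration from my side (the print itself doesn't exist in the
tree, only the function does), a sketch of such a print at one of the places
the scheduler arms the timer - drm_sched_start_timeout() - could look like
this:

    static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
    {
            if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
                !list_empty(&sched->ring_mirror_list)) {
                    /* Debug only: log which scheduler instance arms the
                     * TDR timer and from where it was called. */
                    printk("%s: arming tdr timer (caller %pS)\n",
                           sched->name, __builtin_return_address(0));
                    schedule_delayed_work(&sched->work_tdr, sched->timeout);
            }
    }

Together with a matching print in the timeout handler itself, that should
show which instance rearms the timer right before the jobs get freed.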

Andrey

On 11/13/19 2:36 AM, Christian König wrote:
> The question is where do we rearm the timer for this problem to occur?
>
> Regards,
> Christian.
>
> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>
>> I was able to reproduce the crash by using the attached
>> simulate_crash.patch - waiting on the guilty job to signal in the reset work
>> and artificially rearming the timeout timer just before the check for
>> !cancel_delayed_work(&sched->work_tdr) in drm_sched_cleanup_jobs -
>> crash log attached in crash.log. This I think confirms the theory I
>> described earlier in this thread.
>>
>> basic_fix.patch handles this by testing whether another timer is already
>> armed on this scheduler or whether a timeout work is executing right
>> now (see the documentation for work_busy) - obviously this is not a full
>> solution, as it will not protect from races if for example there is
>> immediate work scheduling such as in drm_sched_fault - so we
>> probably need to account for this by making drm_sched_cleanup_jobs
>> (at least the part where it iterates the ring mirror list and frees
>> jobs) and GPU reset really mutually exclusive, and not like it is now.
>>
>> Andrey
>>
>>
>> On 11/11/19 4:11 PM, Christian König wrote:
>>> Hi Emily,
>>>
>>> you need to print which scheduler instance is freeing the jobs and 
>>> which one is triggering the reset. The TID and PID are completely 
>>> meaningless here since we are called from different worker threads 
>>> and the TID/PID can change on each call.
>>>
>>> Apart from that I will look into this a bit deeper when I have time.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>> Hi Christian,
>>>>     I added the following print in drm_sched_cleanup_jobs. From
>>>> the log it shows that using only cancel_delayed_work could not avoid
>>>> freeing the job while the sched is in reset. But I don't know exactly
>>>> where the driver goes wrong. Do you have any suggestion
>>>> about this?
>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", 
>>>> current->tgid, current->pid);
>>>>         /*
>>>>          * Don't destroy jobs while the timeout worker is running  
>>>> OR thread
>>>>          * is being parked and hence assumed to not touch 
>>>> ring_mirror_list
>>>>          */
>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>                 return;
>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>> current->tgid, current->pid);
>>>> Best wishes
>>>> Emily Deng
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>> information: process  pid 0 thread pid 0, 
>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>> pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>> pid:2262
>>>> >-----Original Message-----
>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>> >
>>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>>> >returned true it guarantees that timeout work is not running but, it merely
>>>> >means there was a pending timeout work which was removed from the
>>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>>> >possibility where while timeout work started executing another timeout work
>>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>>> >timeout job in progress.
>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>>> >waits for schedule thread  to be parked again when calling park_thread.
>>>> >
>>>> >Andrey
>>>> >
>>>> >________________________________________
>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of
>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>> >Sent: 08 November 2019 05:35:18
>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>> >
>>>> >Hi Emily,
>>>> >
>>>> >exactly that can't happen. See here:
>>>> >
>>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>>> >>                 return NULL;
>>>> >
>>>> >We never free jobs while the timeout working is running to prevent exactly
>>>> >that issue.
>>>> >
>>>> >Regards,
>>>> >Christian.
>>>> >
>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>> >> Hi Christian,
>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>> >>
>>>> >> Best wishes
>>>> >> Emily Deng
>>>> >>
>>>> >>
>>>> >>
>>>> >>> -----Original Message-----
>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>> >>>
>>>> >>> Hi Emily,
>>>> >>>
>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>> >>>
>>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>> >>>
>>>> >>> Regards,
>>>> >>> Christian.
>>>> >>>
>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>> >>>> Hi Chrisitan,
>>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>>> >>>> are freed by
>>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>> >>>> Best wishes
>>>> >>>> Emily Deng
>>>> >>>>
>>>> >>>>
>>>> >>>>
>>>> >>>>> -----Original Message-----
>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> >gfx@lists.freedesktop.org
>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>> >>>>>
>>>> >>>>> Hi Emily,
>>>> >>>>>
>>>> >>>>> in this case you are on an old code branch.
>>>> >>>>>
>>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>>> >>>>> timeout handler is running.
>>>> >>>>>
>>>> >>>>> See this patch here:
>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>> >>>>>>
>>>> >>>>>>       drm/scheduler: rework job destruction
>>>> >>>>> Regards,
>>>> >>>>> Christian.
>>>> >>>>>
>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>> >>>>>> Hi Christian,
>>>> >>>>>>         Please refer to follow log, when it enter to
>>>> >>>>>> amdgpu_device_gpu_recover
>>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>> >>> signal.
>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>>> >occurs.
>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>> >>> sdma0
>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>> >>>>> amdgpu
>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [  449.794221]
>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process pid 0
>>>> >>>>>> thread pid 0,
>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>> >>>>> at
>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>> >>>>> 0000 [#1] SMP PTI
>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>> >>>>>> 449.803488]
>>>> >>> RIP:
>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>> >>>>>> ff
>>>> >>>>>> 45 85 e4 0f
>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>> >>>>> <48> 8b
>>>> >>> 98
>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 ES: 0000
>>>> >CR0:
>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>> >>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>> >>>>>>
>>>> >>>>>>> -----Original Message-----
>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> >>> gfx@lists.freedesktop.org
>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>> >>>>>>> tdr
>>>> >>>>>>>
>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>> >>>>>>>> Sorry, please take your time.
>>>> >>>>>>> Have you seen my other response a bit below?
>>>> >>>>>>>
>>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>> >>>>>>> NULL without the job also being freed.
>>>> >>>>>>>
>>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>> >>>>>>>
>>>> >>>>>>> Regards,
>>>> >>>>>>> Christian.
>>>> >>>>>>>
>>>> >>>>>>>> Best wishes
>>>> >>>>>>>> Emily Deng
>>>> >>>>>>>>
>>>> >>>>>>>>
>>>> >>>>>>>>
>>>> >>>>>>>>> -----Original Message-----
>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>> >>>>> gfx@lists.freedesktop.org
>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>> >>>>>>>>> tdr
>>>> >>>>>>>>>
>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>> >>>>>>>>>> Ping.....
>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>> >>>>>>>>>
>>>> >>>>>>>>>> Best wishes
>>>> >>>>>>>>>> Emily Deng
>>>> >>>>>>>>>>
>>>> >>>>>>>>>>
>>>> >>>>>>>>>>
>>>> >>>>>>>>>>> -----Original Message-----
>>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>> >>> Behalf
>>>> >>>>>>>>>>> Of Deng, Emily
>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>> >>>>>>>>>>> for tdr
>>>> >>>>>>>>>>>
>>>> >>>>>>>>>>>> -----Original Message-----
>>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>> >>>>>>>>>>>> for tdr
>>>> >>>>>>>>>>>>
>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>> >>>>>>>>>>> go to free
>>>> >>>>> job.
>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>> >>>>>>>>> No, that case can't happen. See here:
>>>> >>>>>>>>>
>>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>> >>>>>>>>>>
>>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>> >>>>>>>>>>            kfree(job);
>>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>>> >>>>>>>>> to the
>>>> >>>>> s_fence.
>>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>>> >>>>>>>>> patch is a clear NAK.
>>>> >>>>>>>>>
>>>> >>>>>>>>> Regards,
>>>> >>>>>>>>> Christian.
>>>> >>>>>>>>>
>>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>> >>>>>>>>>>>> problem is somewhere else.
>>>> >>>>>>>>>>>>
>>>> >>>>>>>>>>>> Regards,
>>>> >>>>>>>>>>>> Christian.
>>>> >>>>>>>>>>>>
>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>> >>>>>>>>>>>>> ---
>>>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>>>> >--
>>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>> >>> amdgpu_device_gpu_recover(struct
>>>> >>>>>>>>>>>> amdgpu_device *adev,
>>>> >>>>>>>>>>>>>            *
>>>> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
>>>> >>>>>>>>>>>>>            */
>>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>>> >>>>>>> &&
>>>> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
>>>> >>>>>>>>>>>>>                   job_signaled = true;
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>> >drm_sched_increase_karma(struct
>>>> >>>>>>>>>>>> drm_sched_job
>>>> >>>>>>>>>>>>> *bad)
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
>>>> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity,
>>>> >>>>>>>>>>>>> tmp,
>>>> >>> &rq-
>>>> >>>>>>>> entities,
>>>> >>>>>>>>>>>> list) {
>>>> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>>> >>>>>>> ==
>>>> >>>>>>>>>>>>> - entity->fence_context) {
>>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>> >>>>>>>>>>>>> + (bad->s_fence-
>>>> >>>>>>>>>>>>> scheduled.context ==
>>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>>> >>>>>>>>>>>>>                                           if
>>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>>> >>>>>>>> karma) >
>>>> >>>>>>>>>>>>>                                               bad->sched-
>>>> >>>> hang_limit)
>>>> >>>>>>>>>>>>>                                                   if
>>>> >>>>>>>>>>>>> (entity-
>>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>>> >>>>>>> drm_gpu_scheduler
>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread
>>>> >>>>>>>>>>>>> is
>>>> >>> stopped.
>>>> >>>>>>>>>>>>>            */
>>>> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp,
>>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>>> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job-
>>>> >>>> s_fence-
>>>> >>>>>>>> parent,
>>>> >>>>>>>>>>>>>                                                 &s_job->cb)) {
>>>> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
>>>> >>> @@ -
>>>> >>>>>>> 395,7
>>>> >>>>>>>>>>> +395,8 @@ void
>>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>> >>>>>>>>>>>>>                            *
>>>> >>>>>>>>>>>>>                            * Job is still alive so fence
>>>> >>>>>>>>>>>>> refcount at
>>>> >>> least 1
>>>> >>>>>>>>>>>>>                            */
>>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>> >>>>>>> false);
>>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>> >>>>>>>> finished,
>>>> >>>>>>>>>>>> false);
>>>> >>>>>>>>>>>>>                           /*
>>>> >>>>>>>>>>>>>                            * We must keep bad job alive
>>>> >>>>>>>>>>>>> for later
>>>> >>> use
>>>> >>>>>>> during @@
>>>> >>>>>>>>>>>> -438,7
>>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>> >>>>> *sched,
>>>> >>>>>>>>>>>>> +bool
>>>> >>>>>>>>>>>> full_recovery)
>>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>>> >>>>>>>>>>>>>            */
>>>> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
>>>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>>>> >>>>>>>>>>>>> node)
>>>> >>>>>>>>>>>> {
>>>> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>>> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>>>> >>>>>>>>>>>>> + s_job-
>>>> >>>>>>>> s_fence-
>>>> >>>>>>>>>>>>> parent :
>>>> >>>>>>>>>>>>> +NULL;
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
>>>> >>>>>>>>>>>>>
>>>> >>>>>>>>>>> _______________________________________________
>>>> >>>>>>>>>>> amd-gfx mailing list
>>>> >>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>> >
>>>> >_______________________________________________
>>>> >amd-gfx mailing list
>>>> >amd-gfx@lists.freedesktop.org
>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-13 14:20                                                                             ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-13 14:20 UTC (permalink / raw)
  To: Andrey Grodzovsky, Deng, Emily, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


Another more fundamental question: Could we get rid of the timeout job 
at all?

I mean we used to give this as a parameter to the scheduler callback
because we had the timeout worker in the job, but that is no longer the
case.

E.g. in drm_sched_job_timedout() we do the following:
>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>                                        struct drm_sched_job, node);

Why don't we just remove that here and only get the first job after we 
have stopped the scheduler?
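
Just to sketch the idea (purely illustrative - this assumes we would change
the timedout_job() callback to take the scheduler instead of a job, which is
not the current prototype, and it leaves out re-arming the timer):

    static void drm_sched_job_timedout(struct work_struct *work)
    {
            struct drm_gpu_scheduler *sched =
                    container_of(work, struct drm_gpu_scheduler,
                                 work_tdr.work);

            /* No peeking at ring_mirror_list here any more. */
            sched->ops->timedout_job(sched);
    }

The driver handler would then do the list_first_entry_or_null() lookup only
after it has called drm_sched_stop(), so the job can't be freed behind its
back any more.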

Regards,
Christian.

Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>
> This is why I asked for a trace with the timer enabled, but since there is a
> finite number of places where we touch the timer, Emily can just put prints
> there. Also, I wonder whether this temp fix helps her with the issue or not.
>
> Andrey
>
> On 11/13/19 2:36 AM, Christian König wrote:
>> The question is where do we rearm the timer for this problem to occur?
>>
>> Regards,
>> Christian.
>>
>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>
>>> I was able to reproduce the crash by using the attached
>>> simulate_crash.patch - waiting on the guilty job to signal in the reset
>>> work and artificially rearming the timeout timer just before the check
>>> for !cancel_delayed_work(&sched->work_tdr) in
>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>> think confirms the theory I described earlier in this thread.
>>>
>>> basic_fix.patch handles this by testing whether another timer is
>>> already armed on this scheduler or whether a timeout work is
>>> executing right now (see the documentation for work_busy) - obviously
>>> this is not a full solution, as it will not protect from races if
>>> for example there is immediate work scheduling such as in
>>> drm_sched_fault - so we probably need to account for this by making
>>> drm_sched_cleanup_jobs (at least the part where it iterates the ring
>>> mirror list and frees jobs) and GPU reset really mutually exclusive,
>>> and not like it is now.
>>>
>>> Andrey
>>>
>>>
>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>> Hi Emily,
>>>>
>>>> you need to print which scheduler instance is freeing the jobs and 
>>>> which one is triggering the reset. The TID and PID are completely 
>>>> meaningless here since we are called from different worker threads 
>>>> and the TID/PID can change on each call.
>>>>
>>>> Apart from that I will look into this a bit deeper when I have time.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>> Hi Christian,
>>>>>     I added the following print in drm_sched_cleanup_jobs.
>>>>> From the log it shows that using only cancel_delayed_work could not
>>>>> avoid freeing the job while the sched is in reset. But I don't know
>>>>> exactly where the driver goes wrong. Do you have any
>>>>> suggestion about this?
>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", 
>>>>> current->tgid, current->pid);
>>>>>         /*
>>>>>          * Don't destroy jobs while the timeout worker is running  
>>>>> OR thread
>>>>>          * is being parked and hence assumed to not touch 
>>>>> ring_mirror_list
>>>>>          */
>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>                 return;
>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>>> current->tgid, current->pid);
>>>>> Best wishes
>>>>> Emily Deng
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>>> information: process  pid 0 thread pid 0, 
>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>> pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>> pid:2262
>>>>> >-----Original Message-----
>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>> >To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; Deng, Emily
>>>>> ><Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>> >
>>>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>>>> >returned true it guarantees that timeout work is not running but, it merely
>>>>> >means there was a pending timeout work which was removed from the
>>>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>>>> >possibility where while timeout work started executing another timeout work
>>>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>>>> >timeout job in progress.
>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>>>> >waits for schedule thread  to be parked again when calling park_thread.
>>>>> >
>>>>> >Andrey
>>>>> >
>>>>> >________________________________________
>>>>> >From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> on behalf of
>>>>> >Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>> >Sent: 08 November 2019 05:35:18
>>>>> >To: Deng, Emily; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>> >
>>>>> >Hi Emily,
>>>>> >
>>>>> >exactly that can't happen. See here:
>>>>> >
>>>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>>>> >>                 return NULL;
>>>>> >
>>>>> >We never free jobs while the timeout working is running to prevent exactly
>>>>> >that issue.
>>>>> >
>>>>> >Regards,
>>>>> >Christian.
>>>>> >
>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>> >> Hi Christian,
>>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>> >>
>>>>> >> Best wishes
>>>>> >> Emily Deng
>>>>> >>
>>>>> >>
>>>>> >>
>>>>> >>> -----Original Message-----
>>>>> >>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>> >>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>> >>>
>>>>> >>> Hi Emily,
>>>>> >>>
>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>> >>>
>>>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>> >>>
>>>>> >>> Regards,
>>>>> >>> Christian.
>>>>> >>>
>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>> >>>> Hi Chrisitan,
>>>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>>>> >>>> are freed by
>>>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>>> >>>> Best wishes
>>>>> >>>> Emily Deng
>>>>> >>>>
>>>>> >>>>
>>>>> >>>>
>>>>> >>>>> -----Original Message-----
>>>>> >>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>> >>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>> >gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>> >>>>>
>>>>> >>>>> Hi Emily,
>>>>> >>>>>
>>>>> >>>>> in this case you are on an old code branch.
>>>>> >>>>>
>>>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>> >>>>> timeout handler is running.
>>>>> >>>>>
>>>>> >>>>> See this patch here:
>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>> >>>>>> Author: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org>
>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>> >>>>>>
>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>> >>>>> Regards,
>>>>> >>>>> Christian.
>>>>> >>>>>
>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>> >>>>>> Hi Christian,
>>>>> >>>>>>         Please refer to follow log, when it enter to
>>>>> >>>>>> amdgpu_device_gpu_recover
>>>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>>> >>> signal.
>>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>>>> >occurs.
>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>> >>> sdma0
>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>> >>>>> amdgpu
>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>> >>>>>> process pid 0 thread pid 0, s_job:000000005086879e [  449.794221]
>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>> >>>>>> thread pid 0,
>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>> >>>>> at
>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>> >>>>> 0000 [#1] SMP PTI
>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>> >>>>>> 449.803488]
>>>>> >>> RIP:
>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>> >>>>>> ff
>>>>> >>>>>> 45 85 e4 0f
>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>> >>>>> <48> 8b
>>>>> >>> 98
>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>> >>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>>>>> >CR0:
>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>> >>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>> >>>>>>
>>>>> >>>>>>> -----Original Message-----
>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>> >>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>> >>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>> >>>>>>> tdr
>>>>> >>>>>>>
>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>> >>>>>>>> Sorry, please take your time.
>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>> >>>>>>>
>>>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>> >>>>>>> NULL without the job also being freed.
>>>>> >>>>>>>
>>>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>> >>>>>>>
>>>>> >>>>>>> Regards,
>>>>> >>>>>>> Christian.
>>>>> >>>>>>>
>>>>> >>>>>>>> Best wishes
>>>>> >>>>>>>> Emily Deng
>>>>> >>>>>>>>
>>>>> >>>>>>>>
>>>>> >>>>>>>>
>>>>> >>>>>>>>> -----Original Message-----
>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>> >>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>> >>>>>>>>> tdr
>>>>> >>>>>>>>>
>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>> >>>>>>>>>> Ping.....
>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>> >>>>>>>>>
>>>>> >>>>>>>>>> Best wishes
>>>>> >>>>>>>>>> Emily Deng
>>>>> >>>>>>>>>>
>>>>> >>>>>>>>>>
>>>>> >>>>>>>>>>
>>>>> >>>>>>>>>>> -----Original Message-----
>>>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> On
>>>>> >>> Behalf
>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>> >>>>>>>>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>> >>>>>>>>>>> for tdr
>>>>> >>>>>>>>>>>
>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>;
>>>>> >>>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>> >>>>>>>>>>>> for tdr
>>>>> >>>>>>>>>>>>
>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>> >>>>>>>>>>> go to free
>>>>> >>>>> job.
>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>> >>>>>>>>>
>>>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>> >>>>>>>>>>
>>>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>> >>>>>>>>>>            kfree(job);
>>>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>> >>>>>>>>> to the
>>>>> >>>>> s_fence.
>>>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>> >>>>>>>>> patch is a clear NAK.
>>>>> >>>>>>>>>
>>>>> >>>>>>>>> Regards,
>>>>> >>>>>>>>> Christian.
>>>>> >>>>>>>>>
>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>> >>>>>>>>>>>>
>>>>> >>>>>>>>>>>> Regards,
>>>>> >>>>>>>>>>>> Christian.
>>>>> >>>>>>>>>>>>
>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng-5C7GfCeVMHo@public.gmane.org>
>>>>> >>>>>>>>>>>>> ---
>>>>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>>>>> >--
>>>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>> >>> amdgpu_device_gpu_recover(struct
>>>>> >>>>>>>>>>>> amdgpu_device *adev,
>>>>> >>>>>>>>>>>>>            *
>>>>> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
>>>>> >>>>>>>>>>>>>            */
>>>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>>>> >>>>>>> &&
>>>>> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
>>>>> >>>>>>>>>>>>>                   job_signaled = true;
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>> >drm_sched_increase_karma(struct
>>>>> >>>>>>>>>>>> drm_sched_job
>>>>> >>>>>>>>>>>>> *bad)
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
>>>>> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity,
>>>>> >>>>>>>>>>>>> tmp,
>>>>> >>> &rq-
>>>>> >>>>>>>> entities,
>>>>> >>>>>>>>>>>> list) {
>>>>> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>>>> >>>>>>> ==
>>>>> >>>>>>>>>>>>> - entity->fence_context) {
>>>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>> >>>>>>>>>>>>> + (bad->s_fence-
>>>>> >>>>>>>>>>>>> scheduled.context ==
>>>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>>>> >>>>>>>>>>>>>                                           if
>>>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>>>> >>>>>>>> karma) >
>>>>> >>>>>>>>>>>>>                                               bad->sched-
>>>>> >>>> hang_limit)
>>>>> >>>>>>>>>>>>>                                                   if
>>>>> >>>>>>>>>>>>> (entity-
>>>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>>>> >>>>>>> drm_gpu_scheduler
>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread
>>>>> >>>>>>>>>>>>> is
>>>>> >>> stopped.
>>>>> >>>>>>>>>>>>>            */
>>>>> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp,
>>>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>>>> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job-
>>>>> >>>> s_fence-
>>>>> >>>>>>>> parent,
>>>>> >>>>>>>>>>>>>                                                 &s_job->cb)) {
>>>>> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
>>>>> >>> @@ -
>>>>> >>>>>>> 395,7
>>>>> >>>>>>>>>>> +395,8 @@ void
>>>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>> >>>>>>>>>>>>>                            *
>>>>> >>>>>>>>>>>>>                            * Job is still alive so fence
>>>>> >>>>>>>>>>>>> refcount at
>>>>> >>> least 1
>>>>> >>>>>>>>>>>>>                            */
>>>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>> >>>>>>> false);
>>>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>> >>>>>>>> finished,
>>>>> >>>>>>>>>>>> false);
>>>>> >>>>>>>>>>>>>                           /*
>>>>> >>>>>>>>>>>>>                            * We must keep bad job alive
>>>>> >>>>>>>>>>>>> for later
>>>>> >>> use
>>>>> >>>>>>> during @@
>>>>> >>>>>>>>>>>> -438,7
>>>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>> >>>>> *sched,
>>>>> >>>>>>>>>>>>> +bool
>>>>> >>>>>>>>>>>> full_recovery)
>>>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>>>> >>>>>>>>>>>>>            */
>>>>> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
>>>>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>> >>>>>>>>>>>>> node)
>>>>> >>>>>>>>>>>> {
>>>>> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>>>> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>>>>> >>>>>>>>>>>>> + s_job-
>>>>> >>>>>>>> s_fence-
>>>>> >>>>>>>>>>>>> parent :
>>>>> >>>>>>>>>>>>> +NULL;
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>> _______________________________________________
>>>>> >>>>>>>>>>> amd-gfx mailing list
>>>>> >>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>> >
>>>>> >_______________________________________________
>>>>> >amd-gfx mailing list
>>>>> >amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>


[-- Attachment #1.2: Type: text/html, Size: 65263 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-13 14:20                                                                             ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-13 14:20 UTC (permalink / raw)
  To: Andrey Grodzovsky, Deng, Emily, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 25657 bytes --]

Another more fundamental question: Could we get rid of the timeout job 
at all?

I mean we used to give this as a parameter to the scheduler callback 
because we had the timeout worker in the job, but that is no longer the 
case.

E.g. in drm_sched_job_timedout() we do the following:
>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>                                        struct drm_sched_job, node);

Why don't we just remove that here and only get the first job after we 
have stopped the scheduler?
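
Completely untested sketch of what I mean - the timedout_job callback 
taking only the scheduler is hypothetical, just to illustrate the idea:

    static void drm_sched_job_timedout(struct work_struct *work)
    {
            struct drm_gpu_scheduler *sched =
                    container_of(work, struct drm_gpu_scheduler, work_tdr);

            /* No peeking at ring_mirror_list here any more, just tell the
             * driver that this scheduler timed out. */
            sched->ops->timedout_job(sched);
    }

    /* Driver side, e.g. in the amdgpu recover path: */
    drm_sched_stop(sched, NULL);
    job = list_first_entry_or_null(&sched->ring_mirror_list,
                                   struct drm_sched_job, node);
    /* ring_mirror_list is stable here since the sched thread is parked and
     * the timeout worker is the one executing this code. */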

Regards,
Christian.

Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>
> This why I asked for a trace with timer enabled, but since there is a 
> finite number of places we touch the timer Emily can just put prints 
> there. Also, I wonder if this temp fix helps her with the issue or not.
>
> Andrey
>
> On 11/13/19 2:36 AM, Christian König wrote:
>> The question is where do we rearm the timer for this problem to occur?
>>
>> Regards,
>> Christian.
>>
>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>
>>> I was able to reproduce the crash by using the attached 
>>> simulate_crash.patch - waiting on guilty job to signal in reset work 
>>> and artificially rearming the timeout timer just before the check 
>>> for !cancel_delayed_work(&sched->work_tdr)  in 
>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I 
>>> think confirms my theory i described earlier in this thread.
>>>
>>> basic_fix.patch handles this by testing whether another timer 
>>> already armed ob this scheduler or is there a timeout work in 
>>> execution right now (see documentation for work_busy) - obviously  
>>> this is not a full solution as this will not protect from races if 
>>> for example there is immediate work scheduling such as in 
>>> drm_sched_fault -  so we probably need to account for this by making 
>>> drm_sched_cleanup_jobs (at least in the part where it iterates ring 
>>> mirror list and frees jobs) and GPU reset really mutually exclusive 
>>> and not like now.
>>>
>>> Andrey
>>>
>>>
>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>> Hi Emily,
>>>>
>>>> you need to print which scheduler instance is freeing the jobs and 
>>>> which one is triggering the reset. The TID and PID is completely 
>>>> meaningless here since we are called from different worker threads 
>>>> and the TID/PID can change on each call.
>>>>
>>>> Apart from that I will look into this a bit deeper when I have time.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>> Hi Christian,
>>>>>     I add the follow print in function drm_sched_cleanup_jobs. 
>>>>> From the log it shows that only use cancel_delayed_work could not 
>>>>> avoid to free job when the sched is in reset. But don’t know 
>>>>> exactly where it is wrong about the driver. Do you have any 
>>>>> suggestion about this?
>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", 
>>>>> current->tgid, current->pid);
>>>>>         /*
>>>>>          * Don't destroy jobs while the timeout worker is running  
>>>>> OR thread
>>>>>          * is being parked and hence assumed to not touch 
>>>>> ring_mirror_list
>>>>>          */
>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>                 return;
>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>>> current->tgid, current->pid);
>>>>> Best wishes
>>>>> Emily Deng
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>>> information: process  pid 0 thread pid 0, 
>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>> pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>> pid:2262
>>>>> >-----Original Message-----
>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>> >
>>>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>>>> >returned true it guarantees that timeout work is not running but, it merely
>>>>> >means there was a pending timeout work which was removed from the
>>>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>>>> >possibility where while timeout work started executing another timeout work
>>>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>>>> >timeout job in progress.
>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>>>> >waits for schedule thread  to be parked again when calling park_thread.
>>>>> >
>>>>> >Andrey
>>>>> >
>>>>> >________________________________________
>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of
>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>> >Sent: 08 November 2019 05:35:18
>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>> >
>>>>> >Hi Emily,
>>>>> >
>>>>> >exactly that can't happen. See here:
>>>>> >
>>>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>>>> >>                 return NULL;
>>>>> >
>>>>> >We never free jobs while the timeout working is running to prevent exactly
>>>>> >that issue.
>>>>> >
>>>>> >Regards,
>>>>> >Christian.
>>>>> >
>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>> >> Hi Christian,
>>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>> >>
>>>>> >> Best wishes
>>>>> >> Emily Deng
>>>>> >>
>>>>> >>
>>>>> >>
>>>>> >>> -----Original Message-----
>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>> >>>
>>>>> >>> Hi Emily,
>>>>> >>>
>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>> >>>
>>>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>> >>>
>>>>> >>> Regards,
>>>>> >>> Christian.
>>>>> >>>
>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>> >>>> Hi Chrisitan,
>>>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>>>> >>>> are freed by
>>>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>>> >>>> Best wishes
>>>>> >>>> Emily Deng
>>>>> >>>>
>>>>> >>>>
>>>>> >>>>
>>>>> >>>>> -----Original Message-----
>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> >gfx@lists.freedesktop.org
>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>> >>>>>
>>>>> >>>>> Hi Emily,
>>>>> >>>>>
>>>>> >>>>> in this case you are on an old code branch.
>>>>> >>>>>
>>>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>> >>>>> timeout handler is running.
>>>>> >>>>>
>>>>> >>>>> See this patch here:
>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>> >>>>>>
>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>> >>>>> Regards,
>>>>> >>>>> Christian.
>>>>> >>>>>
>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>> >>>>>> Hi Christian,
>>>>> >>>>>>         Please refer to follow log, when it enter to
>>>>> >>>>>> amdgpu_device_gpu_recover
>>>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>>> >>> signal.
>>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>>>> >occurs.
>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>> >>> sdma0
>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>> >>>>> amdgpu
>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>> >>>>>> process pid 0 thread pid 0, s_job:000000005086879e [  449.794221]
>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>> >>>>>> thread pid 0,
>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>> >>>>> at
>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>> >>>>> 0000 [#1] SMP PTI
>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>> >>>>>> 449.803488]
>>>>> >>> RIP:
>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>> >>>>>> ff
>>>>> >>>>>> 45 85 e4 0f
>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>> >>>>> <48> 8b
>>>>> >>> 98
>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>> >>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>>>>> >CR0:
>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>> >>>>>> worker_thread+0x34/0x410 [  449.814728]  kthread+0x121/0x140 [
>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>> >>>>>>
>>>>> >>>>>>> -----Original Message-----
>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> >>> gfx@lists.freedesktop.org
>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>> >>>>>>> tdr
>>>>> >>>>>>>
>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>> >>>>>>>> Sorry, please take your time.
>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>> >>>>>>>
>>>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>> >>>>>>> NULL without the job also being freed.
>>>>> >>>>>>>
>>>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>> >>>>>>>
>>>>> >>>>>>> Regards,
>>>>> >>>>>>> Christian.
>>>>> >>>>>>>
>>>>> >>>>>>>> Best wishes
>>>>> >>>>>>>> Emily Deng
>>>>> >>>>>>>>
>>>>> >>>>>>>>
>>>>> >>>>>>>>
>>>>> >>>>>>>>> -----Original Message-----
>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>> >>>>> gfx@lists.freedesktop.org
>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>> >>>>>>>>> tdr
>>>>> >>>>>>>>>
>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>> >>>>>>>>>> Ping.....
>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>> >>>>>>>>>
>>>>> >>>>>>>>>> Best wishes
>>>>> >>>>>>>>>> Emily Deng
>>>>> >>>>>>>>>>
>>>>> >>>>>>>>>>
>>>>> >>>>>>>>>>
>>>>> >>>>>>>>>>> -----Original Message-----
>>>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>>> >>> Behalf
>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>> >>>>>>>>>>> for tdr
>>>>> >>>>>>>>>>>
>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>> >>>>>>>>>>>> for tdr
>>>>> >>>>>>>>>>>>
>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>> >>>>>>>>>>> go to free
>>>>> >>>>> job.
>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>> >>>>>>>>>
>>>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>> >>>>>>>>>>
>>>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>> >>>>>>>>>>            kfree(job);
>>>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>> >>>>>>>>> to the
>>>>> >>>>> s_fence.
>>>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>> >>>>>>>>> patch is a clear NAK.
>>>>> >>>>>>>>>
>>>>> >>>>>>>>> Regards,
>>>>> >>>>>>>>> Christian.
>>>>> >>>>>>>>>
>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>> >>>>>>>>>>>>
>>>>> >>>>>>>>>>>> Regards,
>>>>> >>>>>>>>>>>> Christian.
>>>>> >>>>>>>>>>>>
>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>> >>>>>>>>>>>>> ---
>>>>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>>>>> >--
>>>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>> >>> amdgpu_device_gpu_recover(struct
>>>>> >>>>>>>>>>>> amdgpu_device *adev,
>>>>> >>>>>>>>>>>>>            *
>>>>> >>>>>>>>>>>>>            * job->base holds a reference to parent fence
>>>>> >>>>>>>>>>>>>            */
>>>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>>>> >>>>>>> &&
>>>>> >>>>>>>>>>>>>               dma_fence_is_signaled(job->base.s_fence->parent))
>>>>> >>>>>>>>>>>>>                   job_signaled = true;
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>> >drm_sched_increase_karma(struct
>>>>> >>>>>>>>>>>> drm_sched_job
>>>>> >>>>>>>>>>>>> *bad)
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>>>>                           spin_lock(&rq->lock);
>>>>> >>>>>>>>>>>>>                           list_for_each_entry_safe(entity,
>>>>> >>>>>>>>>>>>> tmp,
>>>>> >>> &rq-
>>>>> >>>>>>>> entities,
>>>>> >>>>>>>>>>>> list) {
>>>>> >>>>>>>>>>>>> -                          if (bad->s_fence->scheduled.context
>>>>> >>>>>>> ==
>>>>> >>>>>>>>>>>>> - entity->fence_context) {
>>>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>> >>>>>>>>>>>>> + (bad->s_fence-
>>>>> >>>>>>>>>>>>> scheduled.context ==
>>>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>>>> >>>>>>>>>>>>>                                           if
>>>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>>>> >>>>>>>> karma) >
>>>>> >>>>>>>>>>>>>                                               bad->sched-
>>>>> >>>> hang_limit)
>>>>> >>>>>>>>>>>>>                                                   if
>>>>> >>>>>>>>>>>>> (entity-
>>>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>>>> >>>>>>> drm_gpu_scheduler
>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>> >>>>>>>>>>>>>            * This iteration is thread safe as sched thread
>>>>> >>>>>>>>>>>>> is
>>>>> >>> stopped.
>>>>> >>>>>>>>>>>>>            */
>>>>> >>>>>>>>>>>>>           list_for_each_entry_safe_reverse(s_job, tmp,
>>>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>> >>>>>>>>>>>>> +          if (s_job->s_fence && s_job->s_fence->parent &&
>>>>> >>>>>>>>>>>>>                       dma_fence_remove_callback(s_job-
>>>>> >>>> s_fence-
>>>>> >>>>>>>> parent,
>>>>> >>>>>>>>>>>>>                                                 &s_job->cb)) {
>>>>> >>>>>>>>>>>>>                           atomic_dec(&sched->hw_rq_count);
>>>>> >>> @@ -
>>>>> >>>>>>> 395,7
>>>>> >>>>>>>>>>> +395,8 @@ void
>>>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>> >>>>>>>>>>>>>                            *
>>>>> >>>>>>>>>>>>>                            * Job is still alive so fence
>>>>> >>>>>>>>>>>>> refcount at
>>>>> >>> least 1
>>>>> >>>>>>>>>>>>>                            */
>>>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>> >>>>>>> false);
>>>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>> >>>>>>>> finished,
>>>>> >>>>>>>>>>>> false);
>>>>> >>>>>>>>>>>>>                           /*
>>>>> >>>>>>>>>>>>>                            * We must keep bad job alive
>>>>> >>>>>>>>>>>>> for later
>>>>> >>> use
>>>>> >>>>>>> during @@
>>>>> >>>>>>>>>>>> -438,7
>>>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler
>>>>> >>>>> *sched,
>>>>> >>>>>>>>>>>>> +bool
>>>>> >>>>>>>>>>>> full_recovery)
>>>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>>>> >>>>>>>>>>>>>            */
>>>>> >>>>>>>>>>>>>           list_for_each_entry_safe(s_job, tmp,
>>>>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>> >>>>>>>>>>>>> node)
>>>>> >>>>>>>>>>>> {
>>>>> >>>>>>>>>>>>> -          struct dma_fence *fence = s_job->s_fence->parent;
>>>>> >>>>>>>>>>>>> +          struct dma_fence *fence = s_job->s_fence ?
>>>>> >>>>>>>>>>>>> + s_job-
>>>>> >>>>>>>> s_fence-
>>>>> >>>>>>>>>>>>> parent :
>>>>> >>>>>>>>>>>>> +NULL;
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>>>>                   atomic_inc(&sched->hw_rq_count);
>>>>> >>>>>>>>>>>>>
>>>>> >>>>>>>>>>> _______________________________________________
>>>>> >>>>>>>>>>> amd-gfx mailing list
>>>>> >>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>> >
>>>>> >_______________________________________________
>>>>> >amd-gfx mailing list
>>>>> >amd-gfx@lists.freedesktop.org
>>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>


[-- Attachment #1.2: Type: text/html, Size: 63678 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-13 16:00                                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-13 16:00 UTC (permalink / raw)
  To: Christian König, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 27828 bytes --]


On 11/13/19 9:20 AM, Christian König wrote:
> Another more fundamental question: Could we get rid of the timeout job 
> at all?


There is other stuff in there besides picking the first unfinished job 
that is common to all the drivers - such as freeing the guilty job once 
it has signaled and rearming the timeout work timer.
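
Roughly this part, from memory (please double check against the actual 
sched_main.c, this is just to illustrate what I mean):

    job = list_first_entry_or_null(&sched->ring_mirror_list,
                                   struct drm_sched_job, node);

    if (job)
            job->sched->ops->timedout_job(job);

    /* Guilty job already signaled - drm_sched_stop() only marked it and it
     * gets freed here instead. */
    if (sched->free_guilty) {
            job->sched->ops->free_job(job);
            sched->free_guilty = false;
    }

    /* ... and rearm the timeout timer for the next job. */
    spin_lock_irqsave(&sched->job_list_lock, flags);
    drm_sched_start_timeout(sched);
    spin_unlock_irqrestore(&sched->job_list_lock, flags);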


>
> I mean we used to give this as a parameter to the scheduler callback 
> because we had the timeout worker in the job, but that is no longer 
> the case.
>
> E.g. in drm_sched_job_timedout() we do the following:
>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>                                        struct drm_sched_job, node);
>
> Why don't we just remove that here and only get the first job after we 
> have stopped the scheduler?


Should be OK, since we have the extra check for __kthread_should_park() in 
drm_sched_cleanup_jobs() which will protect us in this case from a wakeup 
of the sched thread and execution of drm_sched_cleanup_jobs() after we 
have already parked it. The problem here is that we need the drm_sched_job 
to access the private data of each client driver (see amdgpu_job_timedout 
for example). What about this instead: rather than just peeking at the 
job, actually remove it from ring_mirror_list right there and go ahead 
with it through the reset routine; if it has signaled in the meanwhile, 
great - release it, otherwise put it back into ring_mirror_list in 
drm_sched_resubmit_jobs.
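
Untested sketch of what I mean (locking and names just for illustration):

    static void drm_sched_job_timedout(struct work_struct *work)
    {
            struct drm_gpu_scheduler *sched =
                    container_of(work, struct drm_gpu_scheduler, work_tdr);
            struct drm_sched_job *job;
            unsigned long flags;

            spin_lock_irqsave(&sched->job_list_lock, flags);
            job = list_first_entry_or_null(&sched->ring_mirror_list,
                                           struct drm_sched_job, node);
            if (job)
                    /* Take the job off the list so a concurrent
                     * drm_sched_cleanup_jobs() cannot free it while the
                     * reset routine is still using it. */
                    list_del_init(&job->node);
            spin_unlock_irqrestore(&sched->job_list_lock, flags);

            if (job)
                    job->sched->ops->timedout_job(job);

            /* free_guilty handling and timer rearm as before */
    }

    /* In drm_sched_resubmit_jobs(): if the bad job did not signal in the
     * meanwhile, list_add(&job->node, &sched->ring_mirror_list) puts it
     * back; if it did signal, it is released instead. */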

Andrey


>
> Regards,
> Christian.
>
> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>
>> This why I asked for a trace with timer enabled, but since there is a 
>> finite number of places we touch the timer Emily can just put prints 
>> there. Also, I wonder if this temp fix helps her with the issue or not.
>>
>> Andrey
>>
>> On 11/13/19 2:36 AM, Christian König wrote:
>>> The question is where do we rearm the timer for this problem to occur?
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>
>>>> I was able to reproduce the crash by using the attached 
>>>> simulate_crash.patch - waiting on guilty job to signal in reset 
>>>> work and artificially rearming the timeout timer just before the 
>>>> check for !cancel_delayed_work(&sched->work_tdr)  in 
>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I 
>>>> think confirms my theory i described earlier in this thread.
>>>>
>>>> basic_fix.patch handles this by testing whether another timer 
>>>> already armed ob this scheduler or is there a timeout work in 
>>>> execution right now (see documentation for work_busy) - obviously  
>>>> this is not a full solution as this will not protect from races if 
>>>> for example there is immediate work scheduling such as in 
>>>> drm_sched_fault -  so we probably need to account for this by 
>>>> making drm_sched_cleanup_jobs (at least in the part where it 
>>>> iterates ring mirror list and frees jobs) and GPU reset really 
>>>> mutually exclusive and not like now.
>>>>
>>>> Andrey
>>>>
>>>>
>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>> Hi Emily,
>>>>>
>>>>> you need to print which scheduler instance is freeing the jobs and 
>>>>> which one is triggering the reset. The TID and PID is completely 
>>>>> meaningless here since we are called from different worker threads 
>>>>> and the TID/PID can change on each call.
>>>>>
>>>>> Apart from that I will look into this a bit deeper when I have time.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>     I add the follow print in function drm_sched_cleanup_jobs. 
>>>>>> From the log it shows that only use cancel_delayed_work could not 
>>>>>> avoid to free job when the sched is in reset. But don’t know 
>>>>>> exactly where it is wrong about the driver. Do you have any 
>>>>>> suggestion about this?
>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", 
>>>>>> current->tgid, current->pid);
>>>>>>         /*
>>>>>>          * Don't destroy jobs while the timeout worker is 
>>>>>> running  OR thread
>>>>>>          * is being parked and hence assumed to not touch 
>>>>>> ring_mirror_list
>>>>>>          */
>>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>                 return;
>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>>>> current->tgid, current->pid);
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>>>> information: process  pid 0 thread  pid 0, 
>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>>> pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>>> pid:2262
>>>>>> >-----Original Message-----
>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>> >To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; Deng, Emily
>>>>>> ><Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>> >
>>>>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>>>>> >returned true it guarantees that timeout work is not running but, it merely
>>>>>> >means there was a pending timeout work which was removed from the
>>>>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>>>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>>>>> >possibility where while timeout work started executing another timeout work
>>>>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>>>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>>>>> >timeout job in progress.
>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>>>>> >waits for schedule thread  to be parked again when calling park_thread.
>>>>>> >
>>>>>> >Andrey
>>>>>> >
>>>>>> >________________________________________
>>>>>> >From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> on behalf of
>>>>>> >Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>> >To: Deng, Emily; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>> >
>>>>>> >Hi Emily,
>>>>>> >
>>>>>> >exactly that can't happen. See here:
>>>>>> >
>>>>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>>>>> >>                 return NULL;
>>>>>> >
>>>>>> >We never free jobs while the timeout working is running to prevent exactly
>>>>>> >that issue.
>>>>>> >
>>>>>> >Regards,
>>>>>> >Christian.
>>>>>> >
>>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>> >> Hi Christian,
>>>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>>> >>
>>>>>> >> Best wishes
>>>>>> >> Emily Deng
>>>>>> >>
>>>>>> >>
>>>>>> >>
>>>>>> >>> -----Original Message-----
>>>>>> >>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>> >>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>> >>>
>>>>>> >>> Hi Emily,
>>>>>> >>>
>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>> >>>
>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>>> >>>
>>>>>> >>> Regards,
>>>>>> >>> Christian.
>>>>>> >>>
>>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>> >>>> Hi Chrisitan,
>>>>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>>>>> >>>> are freed by
>>>>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>>>> >>>> Best wishes
>>>>>> >>>> Emily Deng
>>>>>> >>>>
>>>>>> >>>>
>>>>>> >>>>
>>>>>> >>>>> -----Original Message-----
>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>> >>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>>> >gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>> >>>>>
>>>>>> >>>>> Hi Emily,
>>>>>> >>>>>
>>>>>> >>>>> in this case you are on an old code branch.
>>>>>> >>>>>
>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>> >>>>> timeout handler is running.
>>>>>> >>>>>
>>>>>> >>>>> See this patch here:
>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>> >>>>>> Author: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>> >>>>>>
>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>> >>>>> Regards,
>>>>>> >>>>> Christian.
>>>>>> >>>>>
>>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>> >>>>>> Hi Christian,
>>>>>> >>>>>>         Please refer to follow log, when it enter to
>>>>>> >>>>>> amdgpu_device_gpu_recover
>>>>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>>>> >>> signal.
>>>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>>>>> >occurs.
>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>>> >>> sdma0
>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>>> >>>>> amdgpu
>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>> >>>>>> process pid 0 thread pid 0, s_job:000000005086879e [  449.794221]
>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> >>>>>> thread pid 0,
>>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>>> >>>>> at
>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>> >>>>>> 449.803488]
>>>>>> >>> RIP:
>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>> >>>>>> ff
>>>>>> >>>>>> 45 85 e4 0f
>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>> >>>>> <48> 8b
>>>>>> >>> 98
>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>> >>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>>>>>> >CR0:
>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>> >>>>>> worker_thread+0x34/0x410 [  449.814728] kthread+0x121/0x140 [
>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>> >>>>>>
>>>>>> >>>>>>> -----Original Message-----
>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>>> >>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>> >>>>>>> tdr
>>>>>> >>>>>>>
>>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>> >>>>>>>
>>>>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>> >>>>>>>
>>>>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>> >>>>>>>
>>>>>> >>>>>>> Regards,
>>>>>> >>>>>>> Christian.
>>>>>> >>>>>>>
>>>>>> >>>>>>>> Best wishes
>>>>>> >>>>>>>> Emily Deng
>>>>>> >>>>>>>>
>>>>>> >>>>>>>>
>>>>>> >>>>>>>>
>>>>>> >>>>>>>>> -----Original Message-----
>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>>> >>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>> >>>>>>>>> tdr
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>> >>>>>>>>>> Ping.....
>>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>>> Best wishes
>>>>>> >>>>>>>>>> Emily Deng
>>>>>> >>>>>>>>>>
>>>>>> >>>>>>>>>>
>>>>>> >>>>>>>>>>
>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> On
>>>>>> >>> Behalf
>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>>> >>>>>>>>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>> >>>>>>>>>>> for tdr
>>>>>> >>>>>>>>>>>
>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>;
>>>>>> >>>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>> >>>>>>>>>>>> for tdr
>>>>>> >>>>>>>>>>>>
>>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>> >>>>>>>>>>> go to free
>>>>>> >>>>> job.
>>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>>> >>>>>>>>>>
>>>>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>>> >>>>>>>>>>            kfree(job);
>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>> >>>>>>>>> to the
>>>>>> >>>>> s_fence.
>>>>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>> Regards,
>>>>>> >>>>>>>>> Christian.
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>> >>>>>>>>>>>>
>>>>>> >>>>>>>>>>>> Regards,
>>>>>> >>>>>>>>>>>> Christian.
>>>>>> >>>>>>>>>>>>
>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng-5C7GfCeVMHo@public.gmane.org>
>>>>>> >>>>>>>>>>>>> ---
>>>>>> >>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>> >>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++---
>>>>>> >--
>>>>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>> >>>>>>>>>>>>>
>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>> >>>>>>>>>>>>>  	 *
>>>>>> >>>>>>>>>>>>>  	 * job->base holds a reference to parent fence
>>>>>> >>>>>>>>>>>>>  	 */
>>>>>> >>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>> >>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>> >>>>>>>>>>>>>  	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>> >>>>>>>>>>>>>  		job_signaled = true;
>>>>>> >>>>>>>>>>>>>
>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>> >>>>>>>>>>>>>
>>>>>> >>>>>>>>>>>>>  			spin_lock(&rq->lock);
>>>>>> >>>>>>>>>>>>>  			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>> >>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>> >>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>> >>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>> >>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>> >>>>>>>>>>>>>  					if (atomic_read(&bad->karma) >
>>>>>> >>>>>>>>>>>>>  					    bad->sched->hang_limit)
>>>>>> >>>>>>>>>>>>>  						if (entity->guilty)
>>>>>> >>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>> >>>>>>>>>>>>>  	 * This iteration is thread safe as sched thread is stopped.
>>>>>> >>>>>>>>>>>>>  	 */
>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>> >>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>> >>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>> >>>>>>>>>>>>>  		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>> >>>>>>>>>>>>>  					      &s_job->cb)) {
>>>>>> >>>>>>>>>>>>>  			atomic_dec(&sched->hw_rq_count);
>>>>>> >>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>> >>>>>>>>>>>>>  			 *
>>>>>> >>>>>>>>>>>>>  			 * Job is still alive so fence refcount at least 1
>>>>>> >>>>>>>>>>>>>  			 */
>>>>>> >>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>> >>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>> >>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>> >>>>>>>>>>>>>
>>>>>> >>>>>>>>>>>>>  			/*
>>>>>> >>>>>>>>>>>>>  			 * We must keep bad job alive for later use during
>>>>>> >>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>> >>>>>>>>>>>>>  	 * GPU recovers can't run in parallel.
>>>>>> >>>>>>>>>>>>>  	 */
>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>> >>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>> >>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>> >>>>>>>>>>>>>
>>>>>> >>>>>>>>>>>>>  		atomic_inc(&sched->hw_rq_count);
>>>>>> >>>>>>>>>>>>>
>>>>>> >>>>>>>>>>> _______________________________________________
>>>>>> >>>>>>>>>>> amd-gfx mailing list
>>>>>> >>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>>>>> >
>>>>>> >_______________________________________________
>>>>>> >amd-gfx mailing list
>>>>>> >amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>
>

[-- Attachment #1.2: Type: text/html, Size: 69187 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-13 16:00                                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-13 16:00 UTC (permalink / raw)
  To: Christian König, Deng, Emily, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 27098 bytes --]


On 11/13/19 9:20 AM, Christian König wrote:
> Another more fundamental question: Could we get rid of the timeout job 
> at all?


There is other stuff in there besides picking the first unfinished job, 
stuff that is common for all the drivers - such as freeing the guilty 
job once it has signaled and rearming the timeout work timer.
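
Roughly (paraphrased from memory, not the exact upstream code) the 
driver-independent part I mean looks like this:

/*
 * Rough paraphrase of the common work done around a timeout: free jobs
 * whose scheduler fence already signaled and rearm the timeout timer.
 * Example only - not the exact upstream code.
 */
static void example_free_signaled_and_rearm(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *job;

	while ((job = list_first_entry_or_null(&sched->ring_mirror_list,
					       struct drm_sched_job, node)) &&
	       dma_fence_is_signaled(&job->s_fence->finished)) {
		list_del_init(&job->node);
		sched->ops->free_job(job);
	}

	/* rearm the timeout work for the next unfinished job, if any */
	if (!list_empty(&sched->ring_mirror_list) &&
	    sched->timeout != MAX_SCHEDULE_TIMEOUT)
		schedule_delayed_work(&sched->work_tdr, sched->timeout);
}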


>
> I mean we used to give this as parameter to the scheduler callback 
> because we had the timeout worker in the job, but that is no longer 
> the case.
>
> E.g. in drm_sched_job_timedout() we do the following:
>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>                                        struct drm_sched_job, node);
>
> Why don't we just remove that here and only get the first job after we 
> have stopped the scheduler?


Should be ok since we have the extra check for __kthread_should_park in 
drm_sched_cleanup_jobs, which will protect us in this case from a wakeup 
of the sched thread and execution of drm_sched_cleanup_jobs after we 
already parked it. The problem here is that we need the drm_sched_job to 
access the private data of each client driver (see amdgpu_job_timedout 
for example). What about, instead of peeking at the job, actually 
removing it from ring_mirror_list right there and going ahead with it 
through the reset routine; if it's signaled in the meanwhile, great - 
release it, otherwise put it back into ring_mirror_list in 
drm_sched_resubmit_jobs.
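
Something along these lines is what I have in mind - only an untested 
sketch to illustrate the idea, locking left out (a lock around 
ring_mirror_list would make it race-free):

/*
 * Untested sketch: detach the first unfinished job instead of just
 * peeking at it, so drm_sched_cleanup_jobs() cannot free it while the
 * driver's reset routine is still using it.
 */
static void drm_sched_job_timedout(struct work_struct *work)
{
	struct drm_gpu_scheduler *sched =
		container_of(work, struct drm_gpu_scheduler, work_tdr.work);
	struct drm_sched_job *job;

	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	if (!job)
		return;

	list_del_init(&job->node);

	/*
	 * The driver callback (e.g. amdgpu_job_timedout) now owns the job:
	 * if it turns out to be signaled it is released there, otherwise
	 * drm_sched_resubmit_jobs() puts it back on ring_mirror_list.
	 */
	job->sched->ops->timedout_job(job);
}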

Andrey


>
> Regards,
> Christian.
>
> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>
>> This why I asked for a trace with timer enabled, but since there is a 
>> finite number of places we touch the timer Emily can just put prints 
>> there. Also, I wonder if this temp fix helps her with the issue or not.
>>
>> Andrey
>>
>> On 11/13/19 2:36 AM, Christian König wrote:
>>> The question is where do we rearm the timer for this problem to occur?
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>
>>>> I was able to reproduce the crash by using the attached 
>>>> simulate_crash.patch - waiting on guilty job to signal in reset 
>>>> work and artificially rearming the timeout timer just before the 
>>>> check for !cancel_delayed_work(&sched->work_tdr)  in 
>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I 
>>>> think confirms my theory i described earlier in this thread.
>>>>
>>>> basic_fix.patch handles this by testing whether another timer is 
>>>> already armed on this scheduler or whether a timeout work is in 
>>>> execution right now (see documentation for work_busy) - obviously 
>>>> this is not a full solution as this will not protect from races if 
>>>> for example there is immediate work scheduling such as in 
>>>> drm_sched_fault -  so we probably need to account for this by 
>>>> making drm_sched_cleanup_jobs (at least in the part where it 
>>>> iterates ring mirror list and frees jobs) and GPU reset really 
>>>> mutually exclusive and not like now.
>>>>
>>>> Andrey
>>>>
>>>>
>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>> Hi Emily,
>>>>>
>>>>> you need to print which scheduler instance is freeing the jobs and 
>>>>> which one is triggering the reset. The TID and PID is completely 
>>>>> meaningless here since we are called from different worker threads 
>>>>> and the TID/PID can change on each call.
>>>>>
>>>>> Apart from that I will look into this a bit deeper when I have time.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>> Hi Christian,
>>>>>>     I add the follow print in function drm_sched_cleanup_jobs. 
>>>>>> From the log it shows that only use cancel_delayed_work could not 
>>>>>> avoid to free job when the sched is in reset. But don’t know 
>>>>>> exactly where it is wrong about the driver. Do you have any 
>>>>>> suggestion about this?
>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", 
>>>>>> current->tgid, current->pid);
>>>>>>         /*
>>>>>>          * Don't destroy jobs while the timeout worker is 
>>>>>> running  OR thread
>>>>>>          * is being parked and hence assumed to not touch 
>>>>>> ring_mirror_list
>>>>>>          */
>>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>                 return;
>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>>>> current->tgid, current->pid);
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>>>> information: process  pid 0 thread  pid 0, 
>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>>> pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>>> pid:2262
>>>>>> >-----Original Message-----
>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>> >
>>>>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>>>>> >returned true it guarantees that timeout work is not running but, it merely
>>>>>> >means there was a pending timeout work which was removed from the
>>>>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>>>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>>>>> >possibility where while timeout work started executing another timeout work
>>>>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>>>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>>>>> >timeout job in progress.
>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>>>>> >waits for schedule thread  to be parked again when calling park_thread.
>>>>>> >
>>>>>> >Andrey
>>>>>> >
>>>>>> >________________________________________
>>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of
>>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>> >
>>>>>> >Hi Emily,
>>>>>> >
>>>>>> >exactly that can't happen. See here:
>>>>>> >
>>>>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>>>>> >>                 return NULL;
>>>>>> >
>>>>>> >We never free jobs while the timeout working is running to prevent exactly
>>>>>> >that issue.
>>>>>> >
>>>>>> >Regards,
>>>>>> >Christian.
>>>>>> >
>>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>> >> Hi Christian,
>>>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>>> >>
>>>>>> >> Best wishes
>>>>>> >> Emily Deng
>>>>>> >>
>>>>>> >>
>>>>>> >>
>>>>>> >>> -----Original Message-----
>>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>> >>>
>>>>>> >>> Hi Emily,
>>>>>> >>>
>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>> >>>
>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>>> >>>
>>>>>> >>> Regards,
>>>>>> >>> Christian.
>>>>>> >>>
>>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>> >>>> Hi Chrisitan,
>>>>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>>>>> >>>> are freed by
>>>>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>>>> >>>> Best wishes
>>>>>> >>>> Emily Deng
>>>>>> >>>>
>>>>>> >>>>
>>>>>> >>>>
>>>>>> >>>>> -----Original Message-----
>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>> >gfx@lists.freedesktop.org
>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>> >>>>>
>>>>>> >>>>> Hi Emily,
>>>>>> >>>>>
>>>>>> >>>>> in this case you are on an old code branch.
>>>>>> >>>>>
>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>> >>>>> timeout handler is running.
>>>>>> >>>>>
>>>>>> >>>>> See this patch here:
>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>> >>>>>>
>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>> >>>>> Regards,
>>>>>> >>>>> Christian.
>>>>>> >>>>>
>>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>> >>>>>> Hi Christian,
>>>>>> >>>>>>         Please refer to follow log, when it enter to
>>>>>> >>>>>> amdgpu_device_gpu_recover
>>>>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>>>> >>> signal.
>>>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>>>>> >occurs.
>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>>> >>> sdma0
>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>> >>>>> process  pid 0 thread  pid 0, s_job:000000005086879e [  449.794163]
>>>>>> >>>>> amdgpu
>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>> >>>>>> process pid 0 thread pid 0, s_job:000000005086879e [  449.794221]
>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>> >>>>>> thread pid 0,
>>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>> >>>>> information: process  pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>>> >>>>> at
>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G           OE
>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>> >>>>>> 449.803488]
>>>>>> >>> RIP:
>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>> >>>>>> ff
>>>>>> >>>>>> 45 85 e4 0f
>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>> >>>>> <48> 8b
>>>>>> >>> 98
>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>> >>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>>>>>> >CR0:
>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>> >>>>>> worker_thread+0x34/0x410 [  449.814728] kthread+0x121/0x140 [
>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>> >>>>>>
>>>>>> >>>>>>> -----Original Message-----
>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>> >>> gfx@lists.freedesktop.org
>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>> >>>>>>> tdr
>>>>>> >>>>>>>
>>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>> >>>>>>>
>>>>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>> >>>>>>>
>>>>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>> >>>>>>>
>>>>>> >>>>>>> Regards,
>>>>>> >>>>>>> Christian.
>>>>>> >>>>>>>
>>>>>> >>>>>>>> Best wishes
>>>>>> >>>>>>>> Emily Deng
>>>>>> >>>>>>>>
>>>>>> >>>>>>>>
>>>>>> >>>>>>>>
>>>>>> >>>>>>>>> -----Original Message-----
>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>> >>>>> gfx@lists.freedesktop.org
>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>> >>>>>>>>> tdr
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>> >>>>>>>>>> Ping.....
>>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>>> Best wishes
>>>>>> >>>>>>>>>> Emily Deng
>>>>>> >>>>>>>>>>
>>>>>> >>>>>>>>>>
>>>>>> >>>>>>>>>>
>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>> >>> Behalf
>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>> >>>>>>>>>>> for tdr
>>>>>> >>>>>>>>>>>
>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>> >>>>>>>>>>>> for tdr
>>>>>> >>>>>>>>>>>>
>>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>> >>>>>>>>>>> go to free
>>>>>> >>>>> job.
>>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>>> >>>>>>>>>>
>>>>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>>> >>>>>>>>>>            kfree(job);
>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>> >>>>>>>>> to the
>>>>>> >>>>> s_fence.
>>>>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>> Regards,
>>>>>> >>>>>>>>> Christian.
>>>>>> >>>>>>>>>
>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>> >>>>>>>>>>>>
>>>>>> >>>>>>>>>>>> Regards,
>>>>>> >>>>>>>>>>>> Christian.
>>>>>> >>>>>>>>>>>>
>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>> >>>>>>>>>>>>> [snip - patch diff, identical to the hunks quoted earlier in the thread]
>>>>>> >>>>>>>>>>> _______________________________________________
>>>>>> >>>>>>>>>>> amd-gfx mailing list
>>>>>> >>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> >>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>>>>> >
>>>>>> >_______________________________________________
>>>>>> >amd-gfx mailing list
>>>>>> >amd-gfx@lists.freedesktop.org
>>>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>
>

[-- Attachment #1.2: Type: text/html, Size: 67539 bytes --]

[-- Attachment #2: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-14  8:12                                                                                     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-14  8:12 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> What about instead of peeking at the job to actually remove it from 
> ring_mirror_list right there,
Also an interesting idea. We would need to protect the mirror list with 
a lock again, but that should be the lesser evil.

Maybe prototype that and see if it works or not.
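
For the prototype, maybe something as small as this (untested; the 
job_list_lock here is an assumption and would have to be added to 
struct drm_gpu_scheduler and initialized in drm_sched_init()):

/*
 * Untested sketch: one helper that both drm_sched_job_timedout() and
 * drm_sched_cleanup_jobs() would use, so removal from ring_mirror_list
 * always happens under the same lock.
 */
static struct drm_sched_job *
drm_sched_detach_first_job(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *job;

	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	if (job)
		list_del_init(&job->node);
	spin_unlock(&sched->job_list_lock);

	return job;
}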

Regards,
Christian.

Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>
>
> On 11/13/19 9:20 AM, Christian König wrote:
>> Another more fundamental question: Could we get rid of the timeout 
>> job at all?
>
>
> There are other stuff there besides picking the first unfinished job 
> which is common for all the drivers - such as freeing guilty signaled 
> job and rearming the timeout work timer.
>
>
>>
>> I mean we used to give this as parameter to the scheduler callback 
>> because we had the timeout worker in the job, but that is no longer 
>> the case.
>>
>> E.g. in drm_sched_job_timedout() we do the following:
>>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>                                        struct drm_sched_job, node);
>>
>> Why don't we just remove that here and only get the first job after 
>> we have stopped the scheduler?
>
>
> Should be ok since we have the extra check for __kthread_should_park 
> in drm_sched_cleanup_jobs which will protect us in this case from a 
> wakeup of the sched thread and execution of drm_sched_cleanup_jobs 
> after we already parked it. The problem here is we need the 
> drm_sched_job to access the private data for each client driver (see 
> amdgpu_job_timedout for example). What about instead of peeking at the 
> job to actually remove it from ring_mirror_list right there, go ahead 
> with it through the reset routine, if it's signaled in the meanwhile 
> that great - release it, otherwise put it back into ring_mirror_list 
> in drm_sched_resubmit_jobs.
>
> Andrey
>
>
>>
>> Regards,
>> Christian.
>>
>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>
>>> This why I asked for a trace with timer enabled, but since there is 
>>> a finite number of places we touch the timer Emily can just put 
>>> prints there. Also, I wonder if this temp fix helps her with the 
>>> issue or not.
>>>
>>> Andrey
>>>
>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>> The question is where do we rearm the timer for this problem to occur?
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>
>>>>> I was able to reproduce the crash by using the attached 
>>>>> simulate_crash.patch - waiting on guilty job to signal in reset 
>>>>> work and artificially rearming the timeout timer just before the 
>>>>> check for !cancel_delayed_work(&sched->work_tdr)  in 
>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I 
>>>>> think confirms my theory i described earlier in this thread.
>>>>>
>>>>> basic_fix.patch handles this by testing whether another timer is 
>>>>> already armed on this scheduler or whether a timeout work is in 
>>>>> execution right now (see documentation for work_busy) - obviously 
>>>>> this is not a full solution as this will not protect from races if 
>>>>> for example there is immediate work scheduling such as in 
>>>>> drm_sched_fault -  so we probably need to account for this by 
>>>>> making drm_sched_cleanup_jobs (at least in the part where it 
>>>>> iterates ring mirror list and frees jobs) and GPU reset really 
>>>>> mutually exclusive and not like now.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>> Hi Emily,
>>>>>>
>>>>>> you need to print which scheduler instance is freeing the jobs 
>>>>>> and which one is triggering the reset. The TID and PID is 
>>>>>> completely meaningless here since we are called from different 
>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>
>>>>>> Apart from that I will look into this a bit deeper when I have time.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>> Hi Christian,
>>>>>>>     I add the follow print in function drm_sched_cleanup_jobs. 
>>>>>>> From the log it shows that only use cancel_delayed_work could 
>>>>>>> not avoid to free job when the sched is in reset. But don’t know 
>>>>>>> exactly where it is wrong about the driver. Do you have any 
>>>>>>> suggestion about this?
>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, 
>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>         /*
>>>>>>>          * Don't destroy jobs while the timeout worker is 
>>>>>>> running  OR thread
>>>>>>>          * is being parked and hence assumed to not touch 
>>>>>>> ring_mirror_list
>>>>>>>          */
>>>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>                 return;
>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>>>>> current->tgid, current->pid);
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* 
>>>>>>> Process information: process pid 0 thread  pid 0, 
>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>>>> pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>>>> pid:2262
>>>>>>> >-----Original Message-----
>>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>> >
>>>>>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>>>>>> >returned true it guarantees that timeout work is not running but, it merely
>>>>>>> >means there was a pending timeout work which was removed from the
>>>>>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>>>>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>>>>>> >possibility where while timeout work started executing another timeout work
>>>>>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>>>>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>>>>>> >timeout job in progress.
>>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>>>>>> >waits for schedule thread  to be parked again when calling park_thread.
>>>>>>> >
>>>>>>> >Andrey
>>>>>>> >
>>>>>>> >________________________________________
>>>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of
>>>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>> >
>>>>>>> >Hi Emily,
>>>>>>> >
>>>>>>> >exactly that can't happen. See here:
>>>>>>> >
>>>>>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>>>>>> >>                 return NULL;
>>>>>>> >
>>>>>>> >We never free jobs while the timeout working is running to prevent exactly
>>>>>>> >that issue.
>>>>>>> >
>>>>>>> >Regards,
>>>>>>> >Christian.
>>>>>>> >
>>>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>> >> Hi Christian,
>>>>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>>>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>>>> >>
>>>>>>> >> Best wishes
>>>>>>> >> Emily Deng
>>>>>>> >>
>>>>>>> >>
>>>>>>> >>
>>>>>>> >>> -----Original Message-----
>>>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>> >>>
>>>>>>> >>> Hi Emily,
>>>>>>> >>>
>>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>> >>>
>>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>>>> >>>
>>>>>>> >>> Regards,
>>>>>>> >>> Christian.
>>>>>>> >>>
>>>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>> >>>> Hi Chrisitan,
>>>>>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>>>>>> >>>> are freed by
>>>>>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>>>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>>>>> >>>> Best wishes
>>>>>>> >>>> Emily Deng
>>>>>>> >>>>
>>>>>>> >>>>
>>>>>>> >>>>
>>>>>>> >>>>> -----Original Message-----
>>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> >gfx@lists.freedesktop.org
>>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>> >>>>>
>>>>>>> >>>>> Hi Emily,
>>>>>>> >>>>>
>>>>>>> >>>>> in this case you are on an old code branch.
>>>>>>> >>>>>
>>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>> >>>>> timeout handler is running.
>>>>>>> >>>>>
>>>>>>> >>>>> See this patch here:
>>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>> >>>>>>
>>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>>> >>>>> Regards,
>>>>>>> >>>>> Christian.
>>>>>>> >>>>>
>>>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>> >>>>>> Hi Christian,
>>>>>>> >>>>>>         Please refer to follow log, when it enter to
>>>>>>> >>>>>> amdgpu_device_gpu_recover
>>>>>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>>>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>>>>> >>> signal.
>>>>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>>>>>> >occurs.
>>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>>>> >>> sdma0
>>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>>> >>>>> process  pid 0 thread pid 0, s_job:000000005086879e [  449.794163]
>>>>>>> >>>>> amdgpu
>>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [ 449.794221]
>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> >>>>>> thread pid 0,
>>>>>>> >>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>> >>>>> information: process pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>>>> >>>>> at
>>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G OE
>>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>> >>>>>> 449.803488]
>>>>>>> >>> RIP:
>>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>>> >>>>>> ff
>>>>>>> >>>>>> 45 85 e4 0f
>>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>>> >>>>> <48> 8b
>>>>>>> >>> 98
>>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>> >>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>>>>>>> >CR0:
>>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>> >>>>>> worker_thread+0x34/0x410 [  449.814728] kthread+0x121/0x140 [
>>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>> >>>>>>
>>>>>>> >>>>>>> -----Original Message-----
>>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> >>> gfx@lists.freedesktop.org
>>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> >>>>>>> tdr
>>>>>>> >>>>>>>
>>>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>>> >>>>>>>
>>>>>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>>> >>>>>>>
>>>>>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>> >>>>>>>
>>>>>>> >>>>>>> Regards,
>>>>>>> >>>>>>> Christian.
>>>>>>> >>>>>>>
>>>>>>> >>>>>>>> Best wishes
>>>>>>> >>>>>>>> Emily Deng
>>>>>>> >>>>>>>>
>>>>>>> >>>>>>>>
>>>>>>> >>>>>>>>
>>>>>>> >>>>>>>>> -----Original Message-----
>>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> >>>>> gfx@lists.freedesktop.org
>>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> >>>>>>>>> tdr
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>> >>>>>>>>>> Ping.....
>>>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>>> Best wishes
>>>>>>> >>>>>>>>>> Emily Deng
>>>>>>> >>>>>>>>>>
>>>>>>> >>>>>>>>>>
>>>>>>> >>>>>>>>>>
>>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>> >>> Behalf
>>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>> >>>>>>>>>>> for tdr
>>>>>>> >>>>>>>>>>>
>>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>> >>>>>>>>>>>> for tdr
>>>>>>> >>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>>> >>>>>>>>>>> go to free
>>>>>>> >>>>> job.
>>>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>>>> >>>>>>>>>>
>>>>>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>>>> >>>>>>>>>>            kfree(job);
>>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>> >>>>>>>>> to the
>>>>>>> >>>>> s_fence.
>>>>>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>> Regards,
>>>>>>> >>>>>>>>> Christian.
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>>> >>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>> Regards,
>>>>>>> >>>>>>>>>>>> Christian.
>>>>>>> >>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>> >>>>>>>>>>>>> [snip - patch diff, identical to the hunks quoted earlier in the thread]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-14  8:12                                                                                     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-14  8:12 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Deng, Emily, amd-gfx

> What about instead of peeking at the job to actually remove it from 
> ring_mirror_list right there,
Also an interesting idea. We would need to protect the mirror list with 
a lock again, but that should be the lesser evil.

Maybe prototype that and see if it works or not.

Regards,
Christian.
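
A rough sketch of how that could look in drm_sched_job_timedout() - purely
illustrative, not a tested patch; it assumes the scheduler still has its
job_list_lock spinlock protecting ring_mirror_list, and that the reset path
re-inserts the job before drm_sched_resubmit_jobs() if it isn't released:

static void drm_sched_job_timedout(struct work_struct *work)
{
	struct drm_gpu_scheduler *sched =
		container_of(work, struct drm_gpu_scheduler, work_tdr.work);
	struct drm_sched_job *job;

	/* Hold the list lock so a concurrent cleanup cannot free the job
	 * while we are peeking at it.
	 */
	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	if (job)
		/* Remove it from the mirror list; the reset path either
		 * releases it (if it signaled in the meanwhile) or puts it
		 * back into ring_mirror_list before resubmission.
		 */
		list_del_init(&job->node);
	spin_unlock(&sched->job_list_lock);

	if (job)
		job->sched->ops->timedout_job(job);

	/* Rearming of work_tdr omitted here for brevity. */
}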

On 13.11.19 at 17:00, Andrey Grodzovsky wrote:
>
>
> On 11/13/19 9:20 AM, Christian König wrote:
>> Another more fundamental question: Could we get rid of the timeout 
>> job at all?
>
>
> There is other stuff there besides picking the first unfinished job, 
> which is common for all the drivers - such as freeing the guilty 
> signaled job and rearming the timeout work timer.
>
>
>>
>> I mean we used to give this as parameter to the scheduler callback 
>> because we had the timeout worker in the job, but that is no longer 
>> the case.
>>
>> E.g. in drm_sched_job_timedout() we do the following:
>>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>                                        struct drm_sched_job, node);
>>
>> Why don't we just remove that here and only get the first job after 
>> we have stopped the scheduler?
>
>
> Should be ok since we have the extra check for __kthread_should_park 
> in drm_sched_cleanup_jobs, which will protect us in this case from a 
> wakeup of the sched thread and execution of drm_sched_cleanup_jobs 
> after we have already parked it. The problem here is we need the 
> drm_sched_job to access the private data for each client driver (see 
> amdgpu_job_timedout for example). What about instead of peeking at the 
> job to actually remove it from ring_mirror_list right there, then go 
> ahead with it through the reset routine; if it's signaled in the 
> meanwhile, great - release it, otherwise put it back into 
> ring_mirror_list in drm_sched_resubmit_jobs.
>
> Andrey
>
>
>>
>> Regards,
>> Christian.
>>
>> On 13.11.19 at 15:12, Andrey Grodzovsky wrote:
>>>
>>> This is why I asked for a trace with the timer enabled, but since 
>>> there is a finite number of places where we touch the timer, Emily 
>>> can just put prints there. Also, I wonder whether this temp fix 
>>> helps her with the issue or not.
>>>
>>> Andrey
>>>
>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>> The question is where do we rearm the timer for this problem to occur?
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> On 12.11.19 at 20:21, Andrey Grodzovsky wrote:
>>>>>
>>>>> I was able to reproduce the crash by using the attached 
>>>>> simulate_crash.patch - waiting on the guilty job to signal in the 
>>>>> reset work and artificially rearming the timeout timer just before 
>>>>> the check for !cancel_delayed_work(&sched->work_tdr) in 
>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This, I 
>>>>> think, confirms the theory I described earlier in this thread.
>>>>>
>>>>> basic_fix.patch handles this by testing whether another timer is 
>>>>> already armed on this scheduler or a timeout work is executing 
>>>>> right now (see the documentation for work_busy) - obviously this 
>>>>> is not a full solution, as it will not protect from races if, for 
>>>>> example, there is immediate work scheduling such as in 
>>>>> drm_sched_fault - so we probably need to account for this by 
>>>>> making drm_sched_cleanup_jobs (at least the part where it iterates 
>>>>> the ring mirror list and frees jobs) and GPU reset really mutually 
>>>>> exclusive, not like now.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>> Hi Emily,
>>>>>>
>>>>>> you need to print which scheduler instance is freeing the jobs 
>>>>>> and which one is triggering the reset. The TID and PID are 
>>>>>> completely meaningless here since we are called from different 
>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>
>>>>>> Apart from that I will look into this a bit deeper when I have time.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> On 12.11.19 at 07:02, Deng, Emily wrote:
>>>>>>> Hi Christian,
>>>>>>>     I added the following print in function 
>>>>>>> drm_sched_cleanup_jobs. The log shows that using only 
>>>>>>> cancel_delayed_work could not avoid freeing the job while the 
>>>>>>> sched is in reset. But I don't know exactly where the driver 
>>>>>>> goes wrong. Do you have any suggestions about this?
>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, 
>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>         /*
>>>>>>>          * Don't destroy jobs while the timeout worker is 
>>>>>>> running  OR thread
>>>>>>>          * is being parked and hence assumed to not touch 
>>>>>>> ring_mirror_list
>>>>>>>          */
>>>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>                 return;
>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>>>>> current->tgid, current->pid);
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* 
>>>>>>> Process information: process pid 0 thread  pid 0, 
>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>>>> pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, pid:2253
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>>>> pid:2262
>>>>>>> >-----Original Message-----
>>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>> >
>>>>>>> >Thinking more about this claim - we assume here that if cancel_delayed_work
>>>>>>> >returned true it guarantees that timeout work is not running but, it merely
>>>>>>> >means there was a pending timeout work which was removed from the
>>>>>>> >workqueue before it's timer elapsed and so it didn't have a chance to be
>>>>>>> >dequeued and executed, it doesn't cover already executing work. So there is a
>>>>>>> >possibility where while timeout work started executing another timeout work
>>>>>>> >already got enqueued (maybe through earlier cleanup jobs or through
>>>>>>> >drm_sched_fault) and if at this point another drm_sched_cleanup_jobs runs
>>>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there is a
>>>>>>> >timeout job in progress.
>>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>>> >cancel_delayed_work_sync to flush the timeout work as timeout work itself
>>>>>>> >waits for schedule thread  to be parked again when calling park_thread.
>>>>>>> >
>>>>>>> >Andrey
>>>>>>> >
>>>>>>> >________________________________________
>>>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of
>>>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>> >
>>>>>>> >Hi Emily,
>>>>>>> >
>>>>>>> >exactly that can't happen. See here:
>>>>>>> >
>>>>>>> >>         /* Don't destroy jobs while the timeout worker is running */
>>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>> >>            !cancel_delayed_work(&sched->work_tdr))
>>>>>>> >>                 return NULL;
>>>>>>> >
>>>>>>> >We never free jobs while the timeout working is running to prevent exactly
>>>>>>> >that issue.
>>>>>>> >
>>>>>>> >Regards,
>>>>>>> >Christian.
>>>>>>> >
>>>>>>> >On 08.11.19 at 11:32, Deng, Emily wrote:
>>>>>>> >> Hi Christian,
>>>>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the jobs while
>>>>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>>>> >>
>>>>>>> >> Best wishes
>>>>>>> >> Emily Deng
>>>>>>> >>
>>>>>>> >>
>>>>>>> >>
>>>>>>> >>> -----Original Message-----
>>>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>> >>>
>>>>>>> >>> Hi Emily,
>>>>>>> >>>
>>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>> >>>
>>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job in the first place.
>>>>>>> >>>
>>>>>>> >>> Regards,
>>>>>>> >>> Christian.
>>>>>>> >>>
>>>>>>> >>> On 08.11.19 at 11:22, Deng, Emily wrote:
>>>>>>> >>>> Hi Chrisitan,
>>>>>>> >>>>        No, I am with the new branch and also has the patch. Even it
>>>>>>> >>>> are freed by
>>>>>>> >>> main scheduler, how we could avoid main scheduler to free jobs while
>>>>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>>>>> >>>> Best wishes
>>>>>>> >>>> Emily Deng
>>>>>>> >>>>
>>>>>>> >>>>
>>>>>>> >>>>
>>>>>>> >>>>> -----Original Message-----
>>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> >gfx@lists.freedesktop.org
>>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>> >>>>>
>>>>>>> >>>>> Hi Emily,
>>>>>>> >>>>>
>>>>>>> >>>>> in this case you are on an old code branch.
>>>>>>> >>>>>
>>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>> >>>>> timeout handler is running.
>>>>>>> >>>>>
>>>>>>> >>>>> See this patch here:
>>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>> >>>>>>
>>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>>> >>>>> Regards,
>>>>>>> >>>>> Christian.
>>>>>>> >>>>>
>>>>>>> >>>>> On 08.11.19 at 11:11, Deng, Emily wrote:
>>>>>>> >>>>>> Hi Christian,
>>>>>>> >>>>>>         Please refer to follow log, when it enter to
>>>>>>> >>>>>> amdgpu_device_gpu_recover
>>>>>>> >>>>> function, the bad job 000000005086879e is freeing in function
>>>>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the hardware fence
>>>>>>> >>> signal.
>>>>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>>> >>>>> s_fence is already freed, but job is not freed in time. Then this issue
>>>>>>> >occurs.
>>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
>>>>>>> >>> sdma0
>>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
>>>>>>> >>>>> process  pid 0 thread pid 0, s_job:000000005086879e [  449.794163]
>>>>>>> >>>>> amdgpu
>>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process information:
>>>>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [ 449.794221]
>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  pid 0
>>>>>>> >>>>>> thread pid 0,
>>>>>>> >>>>> s_job:00000000ea85e922 [  449.794287]
>>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>> >>>>> information: process pid 0 thread  pid 0, s_job:00000000ed3a5ac6 [
>>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer dereference
>>>>>>> >>>>> at
>>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] Oops:
>>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G OE
>>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX,
>>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>> >>>>>> 449.803488]
>>>>>>> >>> RIP:
>>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff
>>>>>>> >>>>>> ff
>>>>>>> >>>>>> 45 85 e4 0f
>>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10
>>>>>>> >>>>> <48> 8b
>>>>>>> >>> 98
>>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286 [
>>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000 [
>>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12:
>>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>> >>>>>> knlGS:0000000000000000 [  449.809674] CS:  0010 DS: 0000 ES: 0000
>>>>>>> >CR0:
>>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [
>>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu] [
>>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>> >>>>>> worker_thread+0x34/0x410 [  449.814728] kthread+0x121/0x140 [
>>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  449.815374]  ?
>>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>> >>>>>>
>>>>>>> >>>>>>> -----Original Message-----
>>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> >>> gfx@lists.freedesktop.org
>>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> >>>>>>> tdr
>>>>>>> >>>>>>>
>>>>>>> >>>>>>> On 08.11.19 at 10:39, Deng, Emily wrote:
>>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>>> >>>>>>>
>>>>>>> >>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>>> >>>>>>>
>>>>>>> >>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>> >>>>>>>
>>>>>>> >>>>>>> Regards,
>>>>>>> >>>>>>> Christian.
>>>>>>> >>>>>>>
>>>>>>> >>>>>>>> Best wishes
>>>>>>> >>>>>>>> Emily Deng
>>>>>>> >>>>>>>>
>>>>>>> >>>>>>>>
>>>>>>> >>>>>>>>
>>>>>>> >>>>>>>>> -----Original Message-----
>>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>> >>>>> gfx@lists.freedesktop.org
>>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for
>>>>>>> >>>>>>>>> tdr
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>> On 08.11.19 at 09:52, Deng, Emily wrote:
>>>>>>> >>>>>>>>>> Ping.....
>>>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>>> Best wishes
>>>>>>> >>>>>>>>>> Emily Deng
>>>>>>> >>>>>>>>>>
>>>>>>> >>>>>>>>>>
>>>>>>> >>>>>>>>>>
>>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>>> >>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>> >>> Behalf
>>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>> >>>>>>>>>>> for tdr
>>>>>>> >>>>>>>>>>>
>>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>>> >>>>>>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>> >>>>>>>>>>>> for tdr
>>>>>>> >>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>> On 07.11.19 at 11:25, Emily Deng wrote:
>>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is freed.
>>>>>>> >>>>>>>>>>>>> Then it will has null pointer in amdgpu_device_gpu_recover.
>>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>> >>>>>>>>>>> case, when it enter into the amdgpu_device_gpu_recover, it
>>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, it will
>>>>>>> >>>>>>>>>>> go to free
>>>>>>> >>>>> job.
>>>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is faster. At
>>>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already NULL.
>>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>>>            drm_sched_job_cleanup(s_job);
>>>>>>> >>>>>>>>>>
>>>>>>> >>>>>>>>>>            amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>> >>>>>>>>>>            dma_fence_put(job->fence);
>>>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sync);
>>>>>>> >>>>>>>>>>            amdgpu_sync_free(&job->sched_sync);
>>>>>>> >>>>>>>>>>            kfree(job);
>>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the reference
>>>>>>> >>>>>>>>> to the
>>>>>>> >>>>> s_fence.
>>>>>>> >>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>> Regards,
>>>>>>> >>>>>>>>> Christian.
>>>>>>> >>>>>>>>>
>>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that means the
>>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>>> >>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>> Regards,
>>>>>>> >>>>>>>>>>>> Christian.
>>>>>>> >>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>> >>>>>>>>>>>>> ---
>>>>>>> >>>>>>>>>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>> >>>>>>>>>>>>>  drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>> >>>>>>>>>>>>>  2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>> >>>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>> >>>>>>>>>>>>>  	 *
>>>>>>> >>>>>>>>>>>>>  	 * job->base holds a reference to parent fence
>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>> >>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>> >>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>> >>>>>>>>>>>>>  	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>> >>>>>>>>>>>>>  		job_signaled = true;
>>>>>>> >>>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>> >>>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>>>  			spin_lock(&rq->lock);
>>>>>>> >>>>>>>>>>>>>  			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>> >>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>> >>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>> >>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>> >>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>> >>>>>>>>>>>>>  					if (atomic_read(&bad->karma) >
>>>>>>> >>>>>>>>>>>>>  					    bad->sched->hang_limit)
>>>>>>> >>>>>>>>>>>>>  						if (entity->guilty)
>>>>>>> >>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>> >>>>>>>>>>>>>  	 * This iteration is thread safe as sched thread is stopped.
>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>> >>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>> >>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>> >>>>>>>>>>>>>  		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>> >>>>>>>>>>>>>  					      &s_job->cb)) {
>>>>>>> >>>>>>>>>>>>>  			atomic_dec(&sched->hw_rq_count);
>>>>>>> >>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>> >>>>>>>>>>>>>  			 *
>>>>>>> >>>>>>>>>>>>>  			 * Job is still alive so fence refcount at least 1
>>>>>>> >>>>>>>>>>>>>  			 */
>>>>>>> >>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>> >>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>> >>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>> >>>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>>>  			/*
>>>>>>> >>>>>>>>>>>>>  			 * We must keep bad job alive for later use during
>>>>>>> >>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>> >>>>>>>>>>>>>  	 * GPU recovers can't run in parallel.
>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>> >>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>> >>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>>> >>>>>>>>>>>>>
>>>>>>> >>>>>>>>>>>>>  		atomic_inc(&sched->hw_rq_count);
>>>>>>> >>>>>>>>>>>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-14 15:53                                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-14 15:53 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

ok

Andrey
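
The basic_fix.patch idea quoted further down in this mail (bail out of
drm_sched_cleanup_jobs() when a timeout handler is pending or already
executing) would roughly amount to the guard below - a sketch only, not
the fix that eventually landed; work_busy() is the generic workqueue
helper and WORK_BUSY_RUNNING its "currently executing" flag:

	/*
	 * Don't destroy jobs while the timeout worker is running, OR while
	 * it is still queued. Checking WORK_BUSY_RUNNING closes the window
	 * where cancel_delayed_work() returns true (it cancelled a newly
	 * re-armed timer) while an older timeout handler is still executing.
	 * As noted in the thread, this does not help against work that is
	 * scheduled immediately, e.g. via drm_sched_fault().
	 */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    ((work_busy(&sched->work_tdr.work) & WORK_BUSY_RUNNING) ||
	     !cancel_delayed_work(&sched->work_tdr)))
		return;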

On 11/14/19 3:12 AM, Christian König wrote:
>> What about instead of peeking at the job to actually remove it from 
>> ring_mirror_list right there,
> Also an interesting idea. We would need to protect the mirror list 
> with a lock again, but that should be the lesser evil.
>
> Maybe prototype that and see if it works or not.
>
> Regards,
> Christian.
>
> On 13.11.19 at 17:00, Andrey Grodzovsky wrote:
>>
>>
>> On 11/13/19 9:20 AM, Christian König wrote:
>>> Another more fundamental question: Could we get rid of the timeout 
>>> job at all?
>>
>>
>> There is other stuff there besides picking the first unfinished job, 
>> which is common for all the drivers - such as freeing the guilty 
>> signaled job and rearming the timeout work timer.
>>
>>
>>>
>>> I mean we used to give this as parameter to the scheduler callback 
>>> because we had the timeout worker in the job, but that is no longer 
>>> the case.
>>>
>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>                                        struct drm_sched_job, node);
>>>
>>> Why don't we just remove that here and only get the first job after 
>>> we have stopped the scheduler?
>>
>>
>> Should be ok since we have the extra check for __kthread_should_park 
>> in drm_sched_cleanup_jobs, which will protect us in this case from a 
>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs 
>> after we have already parked it. The problem here is we need the 
>> drm_sched_job to access the private data for each client driver (see 
>> amdgpu_job_timedout for example). What about instead of peeking at 
>> the job to actually remove it from ring_mirror_list right there, then 
>> go ahead with it through the reset routine; if it's signaled in the 
>> meanwhile, great - release it, otherwise put it back into 
>> ring_mirror_list in drm_sched_resubmit_jobs.
>>
>> Andrey
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>> On 13.11.19 at 15:12, Andrey Grodzovsky wrote:
>>>>
>>>> This is why I asked for a trace with the timer enabled, but since 
>>>> there is a finite number of places where we touch the timer, Emily 
>>>> can just put prints there. Also, I wonder whether this temp fix 
>>>> helps her with the issue or not.
>>>>
>>>> Andrey
>>>>
>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>> The question is where do we rearm the timer for this problem to 
>>>>> occur?
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> On 12.11.19 at 20:21, Andrey Grodzovsky wrote:
>>>>>>
>>>>>> I was able to reproduce the crash by using the attached 
>>>>>> simulate_crash.patch - waiting on the guilty job to signal in the 
>>>>>> reset work and artificially rearming the timeout timer just 
>>>>>> before the check for !cancel_delayed_work(&sched->work_tdr) in 
>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This, I 
>>>>>> think, confirms the theory I described earlier in this thread.
>>>>>>
>>>>>> basic_fix.patch handles this by testing whether another timer is 
>>>>>> already armed on this scheduler or a timeout work is executing 
>>>>>> right now (see the documentation for work_busy) - obviously this 
>>>>>> is not a full solution, as it will not protect from races if, for 
>>>>>> example, there is immediate work scheduling such as in 
>>>>>> drm_sched_fault - so we probably need to account for this by 
>>>>>> making drm_sched_cleanup_jobs (at least the part where it 
>>>>>> iterates the ring mirror list and frees jobs) and GPU reset 
>>>>>> really mutually exclusive, not like now.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> you need to print which scheduler instance is freeing the jobs 
>>>>>>> and which one is triggering the reset. The TID and PID are 
>>>>>>> completely meaningless here since we are called from different 
>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>
>>>>>>> Apart from that I will look into this a bit deeper when I have 
>>>>>>> time.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> On 12.11.19 at 07:02, Deng, Emily wrote:
>>>>>>>> Hi Christian,
>>>>>>>>     I added the following print in function 
>>>>>>>> drm_sched_cleanup_jobs. The log shows that using only 
>>>>>>>> cancel_delayed_work could not avoid freeing the job while the 
>>>>>>>> sched is in reset. But I don't know exactly where the driver 
>>>>>>>> goes wrong. Do you have any suggestions about this?
>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, 
>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>         /*
>>>>>>>>          * Don't destroy jobs while the timeout worker is 
>>>>>>>> running  OR thread
>>>>>>>>          * is being parked and hence assumed to not touch 
>>>>>>>> ring_mirror_list
>>>>>>>>          */
>>>>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>                 return;
>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>>>>>> current->tgid, current->pid);
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* 
>>>>>>>> Process information: process pid 0 thread pid 0, 
>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, 
>>>>>>>> pid:2253
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> >-----Original Message-----
>>>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >
>>>>>>>> >Thinking more about this claim - we assume here that if 
>>>>>>>> cancel_delayed_work
>>>>>>>> >returned true it guarantees that timeout work is not running 
>>>>>>>> but, it merely
>>>>>>>> >means there was a pending timeout work which was removed from the
>>>>>>>> >workqueue before it's timer elapsed and so it didn't have a 
>>>>>>>> chance to be
>>>>>>>> >dequeued and executed, it doesn't cover already executing 
>>>>>>>> work. So there is a
>>>>>>>> >possibility where while timeout work started executing another 
>>>>>>>> timeout work
>>>>>>>> >already got enqueued (maybe through earlier cleanup jobs or 
>>>>>>>> through
>>>>>>>> >drm_sched_fault) and if at this point another 
>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even 
>>>>>>>> while there is a
>>>>>>>> >timeout job in progress.
>>>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>>>> >cancel_delayed_work_sync to flush the timeout work as timeout 
>>>>>>>> work itself
>>>>>>>> >waits for schedule thread  to be parked again when calling 
>>>>>>>> park_thread.
>>>>>>>> >
>>>>>>>> >Andrey
>>>>>>>> >
>>>>>>>> >________________________________________
>>>>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on 
>>>>>>>> behalf of
>>>>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >
>>>>>>>> >Hi Emily,
>>>>>>>> >
>>>>>>>> >exactly that can't happen. See here:
>>>>>>>> >
>>>>>>>> >>         /* Don't destroy jobs while the timeout worker is 
>>>>>>>> running */
>>>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>> >> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>> >>                 return NULL;
>>>>>>>> >
>>>>>>>> >We never free jobs while the timeout working is running to 
>>>>>>>> prevent exactly
>>>>>>>> >that issue.
>>>>>>>> >
>>>>>>>> >Regards,
>>>>>>>> >Christian.
>>>>>>>> >
>>>>>>>> >On 08.11.19 at 11:32, Deng, Emily wrote:
>>>>>>>> >> Hi Christian,
>>>>>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the 
>>>>>>>> jobs while
>>>>>>>> >in amdgpu_device_gpu_recover, and before calling drm_sched_stop.
>>>>>>>> >>
>>>>>>>> >> Best wishes
>>>>>>>> >> Emily Deng
>>>>>>>> >>
>>>>>>>> >>
>>>>>>>> >>
>>>>>>>> >>> -----Original Message-----
>>>>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; 
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >>>
>>>>>>>> >>> Hi Emily,
>>>>>>>> >>>
>>>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>> >>>
>>>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job 
>>>>>>>> in the first place.
>>>>>>>> >>>
>>>>>>>> >>> Regards,
>>>>>>>> >>> Christian.
>>>>>>>> >>>
>>>>>>>> >>> On 08.11.19 at 11:22, Deng, Emily wrote:
>>>>>>>> >>>> Hi Chrisitan,
>>>>>>>> >>>>        No, I am with the new branch and also has the 
>>>>>>>> patch. Even it
>>>>>>>> >>>> are freed by
>>>>>>>> >>> main scheduler, how we could avoid main scheduler to free 
>>>>>>>> jobs while
>>>>>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>>>>>> >>>> Best wishes
>>>>>>>> >>>> Emily Deng
>>>>>>>> >>>>
>>>>>>>> >>>>
>>>>>>>> >>>>
>>>>>>>> >>>>> -----Original Message-----
>>>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >gfx@lists.freedesktop.org
>>>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for tdr
>>>>>>>> >>>>>
>>>>>>>> >>>>> Hi Emily,
>>>>>>>> >>>>>
>>>>>>>> >>>>> in this case you are on an old code branch.
>>>>>>>> >>>>>
>>>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only 
>>>>>>>> if no
>>>>>>>> >>>>> timeout handler is running.
>>>>>>>> >>>>>
>>>>>>>> >>>>> See this patch here:
>>>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>> >>>>>>
>>>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>>>> >>>>> Regards,
>>>>>>>> >>>>> Christian.
>>>>>>>> >>>>>
>>>>>>>> >>>>> On 08.11.19 at 11:11, Deng, Emily wrote:
>>>>>>>> >>>>>> Hi Christian,
>>>>>>>> >>>>>>         Please refer to follow log, when it enter to
>>>>>>>> >>>>>> amdgpu_device_gpu_recover
>>>>>>>> >>>>> function, the bad job 000000005086879e is freeing in 
>>>>>>>> function
>>>>>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the 
>>>>>>>> hardware fence
>>>>>>>> >>> signal.
>>>>>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case, the
>>>>>>>> >>>>> s_fence is already freed, but job is not freed in time. 
>>>>>>>> Then this issue
>>>>>>>> >occurs.
>>>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] 
>>>>>>>> *ERROR* ring
>>>>>>>> >>> sdma0
>>>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>>>>>> information:
>>>>>>>> >>>>> process  pid 0 thread pid 0, s_job:000000005086879e [  
>>>>>>>> 449.794163]
>>>>>>>> >>>>> amdgpu
>>>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process 
>>>>>>>> information:
>>>>>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [ 
>>>>>>>> 449.794221]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0,
>>>>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>> >>>>> information: process pid 0 thread  pid 0, 
>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer 
>>>>>>>> dereference
>>>>>>>> >>>>> at
>>>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] 
>>>>>>>> Oops:
>>>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: 
>>>>>>>> G OE
>>>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + 
>>>>>>>> PIIX,
>>>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>> >>>>>> 449.803488]
>>>>>>>> >>> RIP:
>>>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 
>>>>>>>> 56 ff ff
>>>>>>>> >>>>>> ff
>>>>>>>> >>>>>> 45 85 e4 0f
>>>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 
>>>>>>>> 8b 40 10
>>>>>>>> >>>>> <48> 8b
>>>>>>>> >>> 98
>>>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 
>>>>>>>> 48 a8 01
>>>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 
>>>>>>>> 00010286 [
>>>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 
>>>>>>>> RCX:
>>>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 
>>>>>>>> 0000000000000000 [
>>>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 
>>>>>>>> R12:
>>>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 
>>>>>>>> ES: 0000
>>>>>>>> >CR0:
>>>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 
>>>>>>>> 0000000000000000 [
>>>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 
>>>>>>>> [amdgpu] [
>>>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 
>>>>>>>> [amd_sched] [
>>>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>> >>>>>> worker_thread+0x34/0x410 [ 449.814728] 
>>>>>>>> kthread+0x121/0x140 [
>>>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  
>>>>>>>> 449.815374]  ?
>>>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>> >>>>>>
>>>>>>>> >>>>>>> -----Original Message-----
>>>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for
>>>>>>>> >>>>>>> tdr
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> On 08.11.19 at 10:39, Deng, Emily wrote:
>>>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> I can't follow how it would be possible for 
>>>>>>>> job->s_fence to be
>>>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> So it looks like this patch is just papering over some 
>>>>>>>> bigger issues.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> Regards,
>>>>>>>> >>>>>>> Christian.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>>> Best wishes
>>>>>>>> >>>>>>>> Emily Deng
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >>>>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for
>>>>>>>> >>>>>>>>> tdr
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>> On 08.11.19 at 09:52, Deng, Emily wrote:
>>>>>>>> >>>>>>>>>> Ping.....
>>>>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>> Best wishes
>>>>>>>> >>>>>>>>>> Emily Deng
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>>>> From: amd-gfx 
>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>> >>> Behalf
>>>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>> pointer issue
>>>>>>>> >>>>>>>>>>> for tdr
>>>>>>>> >>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>>>>> From: Christian König 
>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>> pointer issue
>>>>>>>> >>>>>>>>>>>> for tdr
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> On 07.11.19 at 11:25, Emily Deng wrote:
>>>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is 
>>>>>>>> freed.
>>>>>>>> >>>>>>>>>>>>> Then it will has null pointer in 
>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job 
>>>>>>>> is destroyed.
>>>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. 
>>>>>>>> But in one
>>>>>>>> >>>>>>>>>>> case, when it enter into the 
>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, 
>>>>>>>> it will
>>>>>>>> >>>>>>>>>>> go to free
>>>>>>>> >>>>> job.
>>>>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is 
>>>>>>>> faster. At
>>>>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already 
>>>>>>>> NULL.
>>>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>> >>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>> >>>>>>>>>> kfree(job);
>>>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the 
>>>>>>>> reference
>>>>>>>> >>>>>>>>> to the
>>>>>>>> >>>>> s_fence.
>>>>>>>> >>>>>>>>> So you are just papering over a much bigger problem 
>>>>>>>> here. This
>>>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>> Regards,
>>>>>>>> >>>>>>>>> Christian.
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that 
>>>>>>>> means the
>>>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> Regards,
>>>>>>>> >>>>>>>>>>>> Christian.
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>> >>>>>>>>>>>>> ---
>>>>>>>> >>>>>>>>>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>> >>>>>>>>>>>>>  drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>> >>>>>>>>>>>>>  2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>> >>>>>>>>>>>>>  	 *
>>>>>>>> >>>>>>>>>>>>>  	 * job->base holds a reference to parent fence
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>>  	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>> >>>>>>>>>>>>>  		job_signaled = true;
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>>  			spin_lock(&rq->lock);
>>>>>>>> >>>>>>>>>>>>>  			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>>> >>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>>> >>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>> >>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>>> >>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>> >>>>>>>>>>>>>  					if (atomic_read(&bad->karma) >
>>>>>>>> >>>>>>>>>>>>>  					    bad->sched->hang_limit)
>>>>>>>> >>>>>>>>>>>>>  						if (entity->guilty)
>>>>>>>> >>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>  	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>> >>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>>  		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>>> >>>>>>>>>>>>>  					      &s_job->cb)) {
>>>>>>>> >>>>>>>>>>>>>  			atomic_dec(&sched->hw_rq_count);
>>>>>>>> >>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>  			 *
>>>>>>>> >>>>>>>>>>>>>  			 * Job is still alive so fence refcount at least 1
>>>>>>>> >>>>>>>>>>>>>  			 */
>>>>>>>> >>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>> >>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>> >>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>>  			/*
>>>>>>>> >>>>>>>>>>>>>  			 * We must keep bad job alive for later use during
>>>>>>>> >>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>>> >>>>>>>>>>>>>  	 * GPU recovers can't run in parallel.
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>> >>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>> >>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>>  		atomic_inc(&sched->hw_rq_count);
>>>>>>>> >>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-14 15:53                                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-14 15:53 UTC (permalink / raw)
  To: christian.koenig, Deng, Emily, amd-gfx

ok

Andrey

On 11/14/19 3:12 AM, Christian König wrote:
>> What about instead of peeking at the job to actually remove it from 
>> ring_mirror_list right there,
> Also an interesting idea. We would need to protect the mirror list 
> with a lock again, but that should be the lesser evil.
>
> Maybe prototype that and see if it works or not.
>
> Regards,
> Christian.
>
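A rough sketch of the direction discussed above - grabbing the mirror-list lock in the timeout handler and pulling the bad job off ring_mirror_list right there - could look roughly like this (illustration only, not code from any posted patch; it reuses the existing job_list_lock, ring_mirror_list and work_tdr fields of struct drm_gpu_scheduler):

static void drm_sched_job_timedout(struct work_struct *work)
{
	struct drm_gpu_scheduler *sched =
		container_of(work, struct drm_gpu_scheduler, work_tdr.work);
	struct drm_sched_job *job;
	unsigned long flags;

	/* Pick the first unfinished job under the lock so a concurrent
	 * drm_sched_cleanup_jobs() cannot free it while we look at it,
	 * and take it off ring_mirror_list right away.
	 */
	spin_lock_irqsave(&sched->job_list_lock, flags);
	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	if (job)
		/* Owned by the reset path now; if it turns out to be signaled
		 * it is released there, otherwise drm_sched_resubmit_jobs()
		 * puts it back on the list.
		 */
		list_del_init(&job->node);
	spin_unlock_irqrestore(&sched->job_list_lock, flags);

	if (job)
		job->sched->ops->timedout_job(job);

	/* Re-arm the timer for the next job on the ring. */
	spin_lock_irqsave(&sched->job_list_lock, flags);
	drm_sched_start_timeout(sched);
	spin_unlock_irqrestore(&sched->job_list_lock, flags);
}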
> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>
>>
>> On 11/13/19 9:20 AM, Christian König wrote:
>>> Another more fundamental question: Could we get rid of the timeout 
>>> job at all?
>>
>>
>> There is other stuff there besides picking the first unfinished job
>> which is common for all the drivers - such as freeing the guilty
>> signaled job and rearming the timeout work timer.
>>
>>
>>>
>>> I mean we used to give this as parameter to the scheduler callback 
>>> because we had the timeout worker in the job, but that is no longer 
>>> the case.
>>>
>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>                                        struct drm_sched_job, node);
>>>
>>> Why don't we just remove that here and only get the first job after 
>>> we have stopped the scheduler?
>>
>>
>> Should be ok since we have the extra check for __kthread_should_park
>> in drm_sched_cleanup_jobs, which will protect us in this case from a
>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>> after we already parked it. The problem here is we need the
>> drm_sched_job to access the private data for each client driver (see
>> amdgpu_job_timedout for example). What about, instead of peeking at
>> the job, actually removing it from ring_mirror_list right there and
>> going ahead with it through the reset routine: if it's signaled in the
>> meanwhile, great - release it; otherwise put it back into
>> ring_mirror_list in drm_sched_resubmit_jobs.
>>
>> Andrey
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>
>>>> This is why I asked for a trace with the timer enabled, but since
>>>> there is a finite number of places we touch the timer, Emily can just
>>>> put prints there. Also, I wonder if this temp fix helps her with the
>>>> issue or not.
>>>>
>>>> Andrey
>>>>
>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>> The question is where do we rearm the timer for this problem to 
>>>>> occur?
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>
>>>>>> I was able to reproduce the crash by using the attached
>>>>>> simulate_crash.patch - waiting on the guilty job to signal in reset
>>>>>> work and artificially rearming the timeout timer just before the
>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>> think confirms the theory I described earlier in this thread.
>>>>>>
>>>>>> basic_fix.patch handles this by testing whether another timer is
>>>>>> already armed on this scheduler or whether a timeout work is
>>>>>> executing right now (see the documentation for work_busy) -
>>>>>> obviously this is not a full solution, as it will not protect from
>>>>>> races if, for example, there is immediate work scheduling such as in
>>>>>> drm_sched_fault - so we probably need to account for this by making
>>>>>> drm_sched_cleanup_jobs (at least the part where it iterates the ring
>>>>>> mirror list and frees jobs) and GPU reset really mutually exclusive,
>>>>>> not like now.
>>>>>>
>>>>>> Andrey
>>>>>>
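The basic_fix.patch attachment itself isn't reproduced in the archive, but going by the description above the guard would look roughly like the following (a sketch only, using the regular timer_pending()/work_busy() helpers on the scheduler's existing work_tdr delayed work):

	/* Don't destroy jobs while a timeout work is armed or already
	 * executing on this scheduler, not only while one is pending
	 * cancellation.
	 */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    (timer_pending(&sched->work_tdr.timer) ||
	     work_busy(&sched->work_tdr.work) ||
	     !cancel_delayed_work(&sched->work_tdr)))
		return;

As noted above, this still races with immediate scheduling paths such as drm_sched_fault(), which is why the discussion moves towards making job cleanup and GPU reset properly mutually exclusive.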
>>>>>>
>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> you need to print which scheduler instance is freeing the jobs 
>>>>>>> and which one is triggering the reset. The TID and PID is 
>>>>>>> completely meaningless here since we are called from different 
>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>
>>>>>>> Apart from that I will look into this a bit deeper when I have 
>>>>>>> time.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
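As an illustration of that suggestion (hypothetical print statements, not from a posted patch), the debug output would need to carry the scheduler instance rather than the worker's tgid/pid, e.g.:

	/* sched->name and the sched pointer identify the ring, unlike the
	 * tgid/pid of whatever worker thread happens to run the code.
	 */
	printk("drm_sched_cleanup_jobs: sched:%s(%p) freeing s_job:%p\n",
	       sched->name, sched, s_job);

	printk("amdgpu_job_timedout: sched:%s(%p) reset triggered by s_job:%p\n",
	       job->base.sched->name, job->base.sched, &job->base);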
>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>> Hi Christian,
>>>>>>>>     I added the following print in drm_sched_cleanup_jobs.
>>>>>>>> The log shows that using only cancel_delayed_work could not
>>>>>>>> avoid freeing the job while the sched is in reset. But I don't
>>>>>>>> know exactly where the driver goes wrong. Do you have any
>>>>>>>> suggestion about this?
>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n",
>>>>>>>> +        current->tgid, current->pid);
>>>>>>>>         /*
>>>>>>>>          * Don't destroy jobs while the timeout worker is running OR thread
>>>>>>>>          * is being parked and hence assumed to not touch ring_mirror_list
>>>>>>>>          */
>>>>>>>>         if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>              !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>                 return;
>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>> +        current->tgid, current->pid);
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* 
>>>>>>>> Process information: process pid 0 thread pid 0, 
>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, 
>>>>>>>> pid:2253
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> >-----Original Message-----
>>>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >
>>>>>>>> >Thinking more about this claim - we assume here that if
>>>>>>>> >cancel_delayed_work returned true it guarantees that the timeout
>>>>>>>> >work is not running, but it merely means there was a pending
>>>>>>>> >timeout work which was removed from the workqueue before its timer
>>>>>>>> >elapsed, and so it didn't have a chance to be dequeued and executed;
>>>>>>>> >it doesn't cover already executing work. So there is a possibility
>>>>>>>> >where, while one timeout work started executing, another timeout
>>>>>>>> >work already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>> >through drm_sched_fault), and if at this point another
>>>>>>>> >drm_sched_cleanup_jobs runs, cancel_delayed_work(&sched->work_tdr)
>>>>>>>> >will return true even while there is a timeout job in progress.
>>>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>>>> >cancel_delayed_work_sync to flush the timeout work, as the timeout
>>>>>>>> >work itself waits for the scheduler thread to be parked again when
>>>>>>>> >calling park_thread.
>>>>>>>> >
>>>>>>>> >Andrey
>>>>>>>> >
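To spell the above out, the problematic interleaving looks roughly like this (hypothetical timeline around the existing guard, for illustration only):

/*
 *  timeout worker                        scheduler thread
 *  --------------                        ----------------
 *  drm_sched_job_timedout() #1 runs
 *  ... still executing ...
 *  work_tdr gets re-armed (earlier       drm_sched_cleanup_jobs():
 *  cleanup or drm_sched_fault())           cancel_delayed_work(&sched->work_tdr)
 *                                          -> true, but it only cancelled the
 *                                             newly queued instance; #1 is
 *                                             still running
 *                                          frees jobs the running timeout
 *                                          handler may still dereference
 *
 * cancel_delayed_work_sync() would close the window, but as said above it
 * cannot be used here: the timeout handler parks the scheduler thread, and
 * the scheduler thread is the caller of drm_sched_cleanup_jobs().
 */
if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
    !cancel_delayed_work(&sched->work_tdr))
	return;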
>>>>>>>> >________________________________________
>>>>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on 
>>>>>>>> behalf of
>>>>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >
>>>>>>>> >Hi Emily,
>>>>>>>> >
>>>>>>>> >exactly that can't happen. See here:
>>>>>>>> >
>>>>>>>> >>         /* Don't destroy jobs while the timeout worker is 
>>>>>>>> running */
>>>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>> >> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>> >>                 return NULL;
>>>>>>>> >
>>>>>>>> >We never free jobs while the timeout working is running to 
>>>>>>>> prevent exactly
>>>>>>>> >that issue.
>>>>>>>> >
>>>>>>>> >Regards,
>>>>>>>> >Christian.
>>>>>>>> >
>>>>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>> >> Hi Christian,
>>>>>>>> >>       The drm_sched_job_timedout -> amdgpu_job_timedout path calls
>>>>>>>> >> amdgpu_device_gpu_recover. I mean the main scheduler frees the jobs
>>>>>>>> >> while we are in amdgpu_device_gpu_recover, before calling
>>>>>>>> >> drm_sched_stop.
>>>>>>>> >>
>>>>>>>> >> Best wishes
>>>>>>>> >> Emily Deng
>>>>>>>> >>
>>>>>>>> >>
>>>>>>>> >>
>>>>>>>> >>> -----Original Message-----
>>>>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; 
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >>>
>>>>>>>> >>> Hi Emily,
>>>>>>>> >>>
>>>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>> >>>
>>>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job 
>>>>>>>> in the first place.
>>>>>>>> >>>
>>>>>>>> >>> Regards,
>>>>>>>> >>> Christian.
>>>>>>>> >>>
>>>>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>> >>>> Hi Christian,
>>>>>>>> >>>>        No, I am on the new branch and also have the patch. Even
>>>>>>>> >>>> if they are freed by the main scheduler, how could we avoid the
>>>>>>>> >>>> main scheduler freeing jobs while we are already in
>>>>>>>> >>>> amdgpu_device_gpu_recover?
>>>>>>>> >>>> Best wishes
>>>>>>>> >>>> Emily Deng
>>>>>>>> >>>>
>>>>>>>> >>>>
>>>>>>>> >>>>
>>>>>>>> >>>>> -----Original Message-----
>>>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >gfx@lists.freedesktop.org
>>>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for tdr
>>>>>>>> >>>>>
>>>>>>>> >>>>> Hi Emily,
>>>>>>>> >>>>>
>>>>>>>> >>>>> in this case you are on an old code branch.
>>>>>>>> >>>>>
>>>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only 
>>>>>>>> if no
>>>>>>>> >>>>> timeout handler is running.
>>>>>>>> >>>>>
>>>>>>>> >>>>> See this patch here:
>>>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>> >>>>>>
>>>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>>>> >>>>> Regards,
>>>>>>>> >>>>> Christian.
>>>>>>>> >>>>>
>>>>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>> >>>>>> Hi Christian,
>>>>>>>> >>>>>>         Please refer to the following log: when it enters the
>>>>>>>> >>>>>> amdgpu_device_gpu_recover function, the bad job 000000005086879e
>>>>>>>> >>>>>> is being freed in amdgpu_job_free_cb at the same time, because
>>>>>>>> >>>>>> the hardware fence signaled. But amdgpu_device_gpu_recover goes
>>>>>>>> >>>>>> faster; in this case the s_fence is already freed, but the job
>>>>>>>> >>>>>> is not freed in time. Then this issue occurs.
>>>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] 
>>>>>>>> *ERROR* ring
>>>>>>>> >>> sdma0
>>>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>>>>>> information:
>>>>>>>> >>>>> process  pid 0 thread pid 0, s_job:000000005086879e [  
>>>>>>>> 449.794163]
>>>>>>>> >>>>> amdgpu
>>>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process 
>>>>>>>> information:
>>>>>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [ 
>>>>>>>> 449.794221]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0,
>>>>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>> >>>>> information: process pid 0 thread  pid 0, 
>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer 
>>>>>>>> dereference
>>>>>>>> >>>>> at
>>>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] 
>>>>>>>> Oops:
>>>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: 
>>>>>>>> G OE
>>>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + 
>>>>>>>> PIIX,
>>>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>> >>>>>> 449.803488]
>>>>>>>> >>> RIP:
>>>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 
>>>>>>>> 56 ff ff
>>>>>>>> >>>>>> ff
>>>>>>>> >>>>>> 45 85 e4 0f
>>>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 
>>>>>>>> 8b 40 10
>>>>>>>> >>>>> <48> 8b
>>>>>>>> >>> 98
>>>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 
>>>>>>>> 48 a8 01
>>>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 
>>>>>>>> 00010286 [
>>>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 
>>>>>>>> RCX:
>>>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 
>>>>>>>> 0000000000000000 [
>>>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 
>>>>>>>> R12:
>>>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 
>>>>>>>> ES: 0000
>>>>>>>> >CR0:
>>>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 
>>>>>>>> 0000000000000000 [
>>>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 
>>>>>>>> [amdgpu] [
>>>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 
>>>>>>>> [amd_sched] [
>>>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>> >>>>>> worker_thread+0x34/0x410 [ 449.814728] 
>>>>>>>> kthread+0x121/0x140 [
>>>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  
>>>>>>>> 449.815374]  ?
>>>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>> >>>>>>
>>>>>>>> >>>>>>> -----Original Message-----
>>>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for
>>>>>>>> >>>>>>> tdr
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> I can't follow how it would be possible for 
>>>>>>>> job->s_fence to be
>>>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> So it looks like this patch is just papering over some 
>>>>>>>> bigger issues.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> Regards,
>>>>>>>> >>>>>>> Christian.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>>> Best wishes
>>>>>>>> >>>>>>>> Emily Deng
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >>>>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for
>>>>>>>> >>>>>>>>> tdr
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>> >>>>>>>>>> Ping.....
>>>>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>> Best wishes
>>>>>>>> >>>>>>>>>> Emily Deng
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>>>> From: amd-gfx 
>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>> >>> Behalf
>>>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>> pointer issue
>>>>>>>> >>>>>>>>>>> for tdr
>>>>>>>> >>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>>>>> From: Christian König 
>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>> pointer issue
>>>>>>>> >>>>>>>>>>>> for tdr
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is 
>>>>>>>> freed.
>>>>>>>> >>>>>>>>>>>>> Then it will has null pointer in 
>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job 
>>>>>>>> is destroyed.
>>>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>> >>>>>>>>>>> one case, when it enters amdgpu_device_gpu_recover, it is
>>>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time it will
>>>>>>>> >>>>>>>>>>> go on to free the job. But amdgpu_device_gpu_recover is
>>>>>>>> >>>>>>>>>>> sometimes faster. At that time, the job is not freed, but
>>>>>>>> >>>>>>>>>>> s_fence is already NULL.
>>>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>> >>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>> >>>>>>>>>> kfree(job);
>>>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the 
>>>>>>>> reference
>>>>>>>> >>>>>>>>> to the
>>>>>>>> >>>>> s_fence.
>>>>>>>> >>>>>>>>> So you are just papering over a much bigger problem 
>>>>>>>> here. This
>>>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>> Regards,
>>>>>>>> >>>>>>>>> Christian.
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that 
>>>>>>>> means the
>>>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> Regards,
>>>>>>>> >>>>>>>>>>>> Christian.
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>> >>>>>>>>>>>>> ---
>>>>>>>> >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>> >>>>>>>>>>>>> drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>> >>>>>>>>>>>>>  2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>> >>>>>>>>>>>>>  	 *
>>>>>>>> >>>>>>>>>>>>>  	 * job->base holds a reference to parent fence
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>>  	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>> >>>>>>>>>>>>>  		job_signaled = true;
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>>  			spin_lock(&rq->lock);
>>>>>>>> >>>>>>>>>>>>>  			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>>> >>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>>> >>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>> >>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>>> >>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>> >>>>>>>>>>>>>  					if (atomic_read(&bad->karma) >
>>>>>>>> >>>>>>>>>>>>>  					    bad->sched->hang_limit)
>>>>>>>> >>>>>>>>>>>>>  						if (entity->guilty)
>>>>>>>> >>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>  	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>> >>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>>  		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>>> >>>>>>>>>>>>>  					      &s_job->cb)) {
>>>>>>>> >>>>>>>>>>>>>  			atomic_dec(&sched->hw_rq_count);
>>>>>>>> >>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>  			 *
>>>>>>>> >>>>>>>>>>>>>  			 * Job is still alive so fence refcount at least 1
>>>>>>>> >>>>>>>>>>>>>  			 */
>>>>>>>> >>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>> >>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>> >>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>>  			/*
>>>>>>>> >>>>>>>>>>>>>  			 * We must keep bad job alive for later use during
>>>>>>>> >>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>>> >>>>>>>>>>>>>  	 * GPU recovers can't run in parallel.
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>> >>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>> >>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>>  		atomic_inc(&sched->hw_rq_count);
>>>>>>>> >>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-14 22:14                                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-14 22:14 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

[-- Attachment #1: Type: text/plain, Size: 30035 bytes --]

Attached.

Emily - can you give it a try ?

Andrey

On 11/14/19 3:12 AM, Christian König wrote:
>> What about instead of peeking at the job to actually remove it from 
>> ring_mirror_list right there,
> Also an interesting idea. We would need to protect the mirror list 
> with a lock again, but that should be the lesser evil.
>
> Maybe prototype that and see if it works or not.
>
> Regards,
> Christian.
>
> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>
>>
>> On 11/13/19 9:20 AM, Christian König wrote:
>>> Another more fundamental question: Could we get rid of the timeout 
>>> job at all?
>>
>>
>> There is other stuff there besides picking the first unfinished job
>> which is common for all the drivers - such as freeing the guilty
>> signaled job and rearming the timeout work timer.
>>
>>
>>>
>>> I mean we used to give this as parameter to the scheduler callback 
>>> because we had the timeout worker in the job, but that is no longer 
>>> the case.
>>>
>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>                                        struct drm_sched_job, node);
>>>
>>> Why don't we just remove that here and only get the first job after 
>>> we have stopped the scheduler?
>>
>>
>> Should be ok since we have the extra check for __kthread_should_park
>> in drm_sched_cleanup_jobs, which will protect us in this case from a
>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>> after we already parked it. The problem here is we need the
>> drm_sched_job to access the private data for each client driver (see
>> amdgpu_job_timedout for example). What about, instead of peeking at
>> the job, actually removing it from ring_mirror_list right there and
>> going ahead with it through the reset routine: if it's signaled in the
>> meanwhile, great - release it; otherwise put it back into
>> ring_mirror_list in drm_sched_resubmit_jobs.
>>
>> Andrey
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>
>>>> This is why I asked for a trace with the timer enabled, but since
>>>> there is a finite number of places we touch the timer, Emily can just
>>>> put prints there. Also, I wonder if this temp fix helps her with the
>>>> issue or not.
>>>>
>>>> Andrey
>>>>
>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>> The question is where do we rearm the timer for this problem to 
>>>>> occur?
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>
>>>>>> I was able to reproduce the crash by using the attached
>>>>>> simulate_crash.patch - waiting on the guilty job to signal in reset
>>>>>> work and artificially rearming the timeout timer just before the
>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>> think confirms the theory I described earlier in this thread.
>>>>>>
>>>>>> basic_fix.patch handles this by testing whether another timer is
>>>>>> already armed on this scheduler or whether a timeout work is
>>>>>> executing right now (see the documentation for work_busy) -
>>>>>> obviously this is not a full solution, as it will not protect from
>>>>>> races if, for example, there is immediate work scheduling such as in
>>>>>> drm_sched_fault - so we probably need to account for this by making
>>>>>> drm_sched_cleanup_jobs (at least the part where it iterates the ring
>>>>>> mirror list and frees jobs) and GPU reset really mutually exclusive,
>>>>>> not like now.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> you need to print which scheduler instance is freeing the jobs 
>>>>>>> and which one is triggering the reset. The TID and PID is 
>>>>>>> completely meaningless here since we are called from different 
>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>
>>>>>>> Apart from that I will look into this a bit deeper when I have 
>>>>>>> time.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>> Hi Christian,
>>>>>>>>     I added the following print in drm_sched_cleanup_jobs.
>>>>>>>> The log shows that using only cancel_delayed_work could not
>>>>>>>> avoid freeing the job while the sched is in reset. But I don't
>>>>>>>> know exactly where the driver goes wrong. Do you have any
>>>>>>>> suggestion about this?
>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n",
>>>>>>>> +        current->tgid, current->pid);
>>>>>>>>         /*
>>>>>>>>          * Don't destroy jobs while the timeout worker is running OR thread
>>>>>>>>          * is being parked and hence assumed to not touch ring_mirror_list
>>>>>>>>          */
>>>>>>>>         if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>              !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>                 return;
>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>> +        current->tgid, current->pid);
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* 
>>>>>>>> Process information: process pid 0 thread pid 0, 
>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, 
>>>>>>>> pid:2253
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> >-----Original Message-----
>>>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
>>>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>> >To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; Deng, Emily
>>>>>>>> ><Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >
>>>>>>>> >Thinking more about this claim - we assume here that if
>>>>>>>> >cancel_delayed_work returned true it guarantees that the timeout
>>>>>>>> >work is not running, but it merely means there was a pending
>>>>>>>> >timeout work which was removed from the workqueue before its timer
>>>>>>>> >elapsed, and so it didn't have a chance to be dequeued and executed;
>>>>>>>> >it doesn't cover already executing work. So there is a possibility
>>>>>>>> >where, while one timeout work started executing, another timeout
>>>>>>>> >work already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>> >through drm_sched_fault), and if at this point another
>>>>>>>> >drm_sched_cleanup_jobs runs, cancel_delayed_work(&sched->work_tdr)
>>>>>>>> >will return true even while there is a timeout job in progress.
>>>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>>>> >cancel_delayed_work_sync to flush the timeout work, as the timeout
>>>>>>>> >work itself waits for the scheduler thread to be parked again when
>>>>>>>> >calling park_thread.
>>>>>>>> >
>>>>>>>> >Andrey
>>>>>>>> >
>>>>>>>> >________________________________________
>>>>>>>> >From: amd-gfx <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> on 
>>>>>>>> behalf of
>>>>>>>> >Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>>>> >To: Deng, Emily; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >
>>>>>>>> >Hi Emily,
>>>>>>>> >
>>>>>>>> >exactly that can't happen. See here:
>>>>>>>> >
>>>>>>>> >>         /* Don't destroy jobs while the timeout worker is 
>>>>>>>> running */
>>>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>> >> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>> >>                 return NULL;
>>>>>>>> >
>>>>>>>> >We never free jobs while the timeout working is running to 
>>>>>>>> prevent exactly
>>>>>>>> >that issue.
>>>>>>>> >
>>>>>>>> >Regards,
>>>>>>>> >Christian.
>>>>>>>> >
>>>>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>> >> Hi Christian,
>>>>>>>> >>       The drm_sched_job_timedout -> amdgpu_job_timedout path calls
>>>>>>>> >> amdgpu_device_gpu_recover. I mean the main scheduler frees the jobs
>>>>>>>> >> while we are in amdgpu_device_gpu_recover, before calling
>>>>>>>> >> drm_sched_stop.
>>>>>>>> >>
>>>>>>>> >> Best wishes
>>>>>>>> >> Emily Deng
>>>>>>>> >>
>>>>>>>> >>
>>>>>>>> >>
>>>>>>>> >>> -----Original Message-----
>>>>>>>> >>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>> >>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; 
>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >>>
>>>>>>>> >>> Hi Emily,
>>>>>>>> >>>
>>>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>> >>>
>>>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job 
>>>>>>>> in the first place.
>>>>>>>> >>>
>>>>>>>> >>> Regards,
>>>>>>>> >>> Christian.
>>>>>>>> >>>
>>>>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>> >>>> Hi Christian,
>>>>>>>> >>>>        No, I am on the new branch and also have the patch. Even
>>>>>>>> >>>> if they are freed by the main scheduler, how could we avoid the
>>>>>>>> >>>> main scheduler freeing jobs while we are already in
>>>>>>>> >>>> amdgpu_device_gpu_recover?
>>>>>>>> >>>> Best wishes
>>>>>>>> >>>> Emily Deng
>>>>>>>> >>>>
>>>>>>>> >>>>
>>>>>>>> >>>>
>>>>>>>> >>>>> -----Original Message-----
>>>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>> >>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>>>>> >gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for tdr
>>>>>>>> >>>>>
>>>>>>>> >>>>> Hi Emily,
>>>>>>>> >>>>>
>>>>>>>> >>>>> in this case you are on an old code branch.
>>>>>>>> >>>>>
>>>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only 
>>>>>>>> if no
>>>>>>>> >>>>> timeout handler is running.
>>>>>>>> >>>>>
>>>>>>>> >>>>> See this patch here:
>>>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>> >>>>>> Author: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>> >>>>>>
>>>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>>>> >>>>> Regards,
>>>>>>>> >>>>> Christian.
>>>>>>>> >>>>>
>>>>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>> >>>>>> Hi Christian,
>>>>>>>> >>>>>>         Please refer to the following log: when it enters the
>>>>>>>> >>>>>> amdgpu_device_gpu_recover function, the bad job 000000005086879e
>>>>>>>> >>>>>> is being freed in amdgpu_job_free_cb at the same time, because
>>>>>>>> >>>>>> the hardware fence signaled. But amdgpu_device_gpu_recover goes
>>>>>>>> >>>>>> faster; in this case the s_fence is already freed, but the job
>>>>>>>> >>>>>> is not freed in time. Then this issue occurs.
>>>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] 
>>>>>>>> *ERROR* ring
>>>>>>>> >>> sdma0
>>>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>>>>>> information:
>>>>>>>> >>>>> process  pid 0 thread pid 0, s_job:000000005086879e [  
>>>>>>>> 449.794163]
>>>>>>>> >>>>> amdgpu
>>>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process 
>>>>>>>> information:
>>>>>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [ 
>>>>>>>> 449.794221]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0,
>>>>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>> >>>>> information: process pid 0 thread  pid 0, 
>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer 
>>>>>>>> dereference
>>>>>>>> >>>>> at
>>>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] 
>>>>>>>> Oops:
>>>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: 
>>>>>>>> G OE
>>>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + 
>>>>>>>> PIIX,
>>>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>> >>>>>> 449.803488]
>>>>>>>> >>> RIP:
>>>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 
>>>>>>>> 56 ff ff
>>>>>>>> >>>>>> ff
>>>>>>>> >>>>>> 45 85 e4 0f
>>>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 
>>>>>>>> 8b 40 10
>>>>>>>> >>>>> <48> 8b
>>>>>>>> >>> 98
>>>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 
>>>>>>>> 48 a8 01
>>>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 
>>>>>>>> 00010286 [
>>>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 
>>>>>>>> RCX:
>>>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 
>>>>>>>> 0000000000000000 [
>>>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 
>>>>>>>> R12:
>>>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 
>>>>>>>> ES: 0000
>>>>>>>> >CR0:
>>>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 
>>>>>>>> 0000000000000000 [
>>>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 
>>>>>>>> [amdgpu] [
>>>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 
>>>>>>>> [amd_sched] [
>>>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>> >>>>>> worker_thread+0x34/0x410 [ 449.814728] 
>>>>>>>> kthread+0x121/0x140 [
>>>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  
>>>>>>>> 449.815374]  ?
>>>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>> >>>>>>
>>>>>>>> >>>>>>> -----Original Message-----
>>>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>>>>> >>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for
>>>>>>>> >>>>>>> tdr
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> I can't follow how it would be possible for 
>>>>>>>> job->s_fence to be
>>>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> So it looks like this patch is just papering over some 
>>>>>>>> bigger issues.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> Regards,
>>>>>>>> >>>>>>> Christian.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>>> Best wishes
>>>>>>>> >>>>>>>> Emily Deng
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>
>>>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>>>>> >>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for
>>>>>>>> >>>>>>>>> tdr
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>> >>>>>>>>>> Ping.....
>>>>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>> Best wishes
>>>>>>>> >>>>>>>>>> Emily Deng
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>>>> From: amd-gfx 
>>>>>>>> <amd-gfx-bounces-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org> On
>>>>>>>> >>> Behalf
>>>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig-5C7GfCeVMHo@public.gmane.org>; amd-
>>>>>>>> >>>>>>>>>>> gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>> pointer issue
>>>>>>>> >>>>>>>>>>> for tdr
>>>>>>>> >>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>>>>> From: Christian König 
>>>>>>>> <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng-5C7GfCeVMHo@public.gmane.org>;
>>>>>>>> >>>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>> pointer issue
>>>>>>>> >>>>>>>>>>>> for tdr
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is 
>>>>>>>> freed.
>>>>>>>> >>>>>>>>>>>>> Then it will has null pointer in 
>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job 
>>>>>>>> is destroyed.
>>>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in
>>>>>>>> >>>>>>>>>>> one case, when it enters amdgpu_device_gpu_recover, it is
>>>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time it will
>>>>>>>> >>>>>>>>>>> go on to free the job. But amdgpu_device_gpu_recover is
>>>>>>>> >>>>>>>>>>> sometimes faster. At that time, the job is not freed, but
>>>>>>>> >>>>>>>>>>> s_fence is already NULL.
>>>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>> >>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>> >>>>>>>>>> kfree(job);
>>>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the 
>>>>>>>> reference
>>>>>>>> >>>>>>>>> to the
>>>>>>>> >>>>> s_fence.
>>>>>>>> >>>>>>>>> So you are just papering over a much bigger problem 
>>>>>>>> here. This
>>>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>> Regards,
>>>>>>>> >>>>>>>>> Christian.
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that 
>>>>>>>> means the
>>>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> Regards,
>>>>>>>> >>>>>>>>>>>> Christian.
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng-5C7GfCeVMHo@public.gmane.org>
>>>>>>>> >>>>>>>>>>>>> ---
>>>>>>>> >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>> >>>>>>>>>>>>> drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>> >>>>>>>>>>>>>  2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>> >>>>>>>>>>>>>  	 *
>>>>>>>> >>>>>>>>>>>>>  	 * job->base holds a reference to parent fence
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>>  	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>> >>>>>>>>>>>>>  		job_signaled = true;
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>>  			spin_lock(&rq->lock);
>>>>>>>> >>>>>>>>>>>>>  			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>>> >>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>>> >>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>> >>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>>> >>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>> >>>>>>>>>>>>>  					if (atomic_read(&bad->karma) >
>>>>>>>> >>>>>>>>>>>>>  					    bad->sched->hang_limit)
>>>>>>>> >>>>>>>>>>>>>  						if (entity->guilty)
>>>>>>>> >>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>  	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>> >>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>>  		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>>> >>>>>>>>>>>>>  					      &s_job->cb)) {
>>>>>>>> >>>>>>>>>>>>>  			atomic_dec(&sched->hw_rq_count);
>>>>>>>> >>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>  			 *
>>>>>>>> >>>>>>>>>>>>>  			 * Job is still alive so fence refcount at least 1
>>>>>>>> >>>>>>>>>>>>>  			 */
>>>>>>>> >>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>> >>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>> >>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>>  			/*
>>>>>>>> >>>>>>>>>>>>>  			 * We must keep bad job alive for later use during
>>>>>>>> >>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>>> >>>>>>>>>>>>>  	 * GPU recovers can't run in parallel.
>>>>>>>> >>>>>>>>>>>>>  	 */
>>>>>>>> >>>>>>>>>>>>>  	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node)
>>>>>>>> >>>>>>>>>>>> {
>>>>>>>> >>>>>>>>>>>>> -          struct dma_fence *fence = 
>>>>>>>> s_job->s_fence->parent;
>>>>>>>> >>>>>>>>>>>>> +          struct dma_fence *fence = 
>>>>>>>> s_job->s_fence ?
>>>>>>>> >>>>>>>>>>>>> + s_job-
>>>>>>>> >>>>>>>> s_fence-
>>>>>>>> >>>>>>>>>>>>> parent :
>>>>>>>> >>>>>>>>>>>>> +NULL;
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>> _______________________________________________
>>>>>>>> >>>>>>>>>>> amd-gfx mailing list
>>>>>>>> >>>>>>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >>>>>>>>>>> 
>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>>>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>>>>> >
>>>>>>>> >_______________________________________________
>>>>>>>> >amd-gfx mailing list
>>>>>>>> >amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>
>>>>>>
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>
>>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

[-- Attachment #2: 0001-drm-scheduler-Avoid-accessing-freed-bad-job.patch --]
[-- Type: text/x-patch, Size: 3267 bytes --]

>From 8471644911cc7e300b08874ebb482d4e8c599904 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
Date: Thu, 14 Nov 2019 16:04:49 -0500
Subject: drm/scheduler: Avoid accessing freed bad job.

Problem:
Due to a race between drm_sched_cleanup_jobs in the sched thread and
drm_sched_job_timedout in the timeout work there is a possibility that
the bad job was already freed while still being accessed from the
timeout thread.

Fix:
Instead of just peeking at the bad job in the mirror list,
remove it from the list under lock and then put it back later, when
we are guaranteed that no race with the main sched thread is possible,
which is after the thread is parked.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
---
 drivers/gpu/drm/scheduler/sched_main.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 80ddbdf..c2a6108 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -287,10 +287,24 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	unsigned long flags;
 
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
+
+	/*
+	 * Protects against concurrent deletion in drm_sched_cleanup_jobs that
+	 * is already in progress.
+	 */
+	spin_lock_irqsave(&sched->job_list_lock, flags);
 	job = list_first_entry_or_null(&sched->ring_mirror_list,
 				       struct drm_sched_job, node);
 
 	if (job) {
+		/*
+		 * Remove the bad job so it cannot be freed by an already in progress
+		 * drm_sched_cleanup_jobs. It will be reinserted after sched->thread
+		 * is parked at which point it's safe.
+		 */
+		list_del_init(&job->node);
+		spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
 		job->sched->ops->timedout_job(job);
 
 		/*
@@ -302,6 +316,8 @@ static void drm_sched_job_timedout(struct work_struct *work)
 			sched->free_guilty = false;
 		}
 	}
+	else
+		spin_unlock_irqrestore(&sched->job_list_lock, flags);
 
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	drm_sched_start_timeout(sched);
@@ -373,6 +389,19 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 	kthread_park(sched->thread);
 
 	/*
+	 * Reinsert back the bad job here - now it's safe as drm_sched_cleanup_jobs
+	 * cannot race against us and release the bad job at this point - we parked
+	 * (waited for) any in progress (earlier) cleanups and any later ones will
+	 * bail out due to sched->thread being parked.
+	 */
+	if (bad && bad->sched == sched)
+		/*
+		 * Add at the head of the queue to reflect it was the earliest
+		 * job extracted.
+		 */
+		list_add(&bad->node, &sched->ring_mirror_list);
+
+	/*
 	 * Iterate the job list from later to  earlier one and either deactive
 	 * their HW callbacks or remove them from mirror list if they already
 	 * signaled.
@@ -657,7 +686,7 @@ static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
 		return;
 
 
-	while (!list_empty(&sched->ring_mirror_list)) {
+	while (!list_empty_careful(&sched->ring_mirror_list)) {
 		struct drm_sched_job *job;
 
 		job = list_first_entry(&sched->ring_mirror_list,
-- 
2.7.4


[-- Attachment #3: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-14 22:14                                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-14 22:14 UTC (permalink / raw)
  To: christian.koenig, Deng, Emily, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 29277 bytes --]

Attached.

Emily - can you give it a try?

Andrey
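
To summarize what the attached patch does: drm_sched_job_timedout() now takes
the bad job off ring_mirror_list under job_list_lock, so a concurrent
drm_sched_cleanup_jobs() cannot free it, and drm_sched_stop() puts it back once
the scheduler thread is parked. Roughly (simplified from the attachment, which
is the authoritative version):

	spin_lock_irqsave(&sched->job_list_lock, flags);
	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	if (job) {
		/* Take the job off the list so it cannot be freed under us. */
		list_del_init(&job->node);
		spin_unlock_irqrestore(&sched->job_list_lock, flags);

		job->sched->ops->timedout_job(job);
	} else {
		spin_unlock_irqrestore(&sched->job_list_lock, flags);
	}

	/* Later, in drm_sched_stop(), after kthread_park(sched->thread): */
	if (bad && bad->sched == sched)
		list_add(&bad->node, &sched->ring_mirror_list);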

On 11/14/19 3:12 AM, Christian König wrote:
>> What about instead of peeking at the job to actually remove it from 
>> ring_mirror_list right there,
> Also an interesting idea. We would need to protect the mirror list 
> with a lock again, but that should be the lesser evil.
>
> Maybe prototype that and see if it works or not.
>
> Regards,
> Christian.
>
> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>
>>
>> On 11/13/19 9:20 AM, Christian König wrote:
>>> Another more fundamental question: Could we get rid of the timeout 
>>> job at all?
>>
>>
>> There is other stuff there besides picking the first unfinished job 
>> which is common for all the drivers - such as freeing the guilty signaled 
>> job and rearming the timeout work timer.
>>
>>
>>>
>>> I mean we used to give this as parameter to the scheduler callback 
>>> because we had the timeout worker in the job, but that is no longer 
>>> the case.
>>>
>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>                                        struct drm_sched_job, node);
>>>
>>> Why don't we just remove that here and only get the first job after 
>>> we have stopped the scheduler?
>>
>>
>> Should be ok, since we have the extra check for __kthread_should_park 
>> in drm_sched_cleanup_jobs, which will protect us in this case from a 
>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs 
>> after we have already parked it. The problem here is that we need the 
>> drm_sched_job to access the private data of each client driver (see 
>> amdgpu_job_timedout for example). What about instead of just peeking at 
>> the job, actually remove it from ring_mirror_list right there and go 
>> ahead with it through the reset routine; if it's signaled in the 
>> meanwhile, great - release it, otherwise put it back into 
>> ring_mirror_list in drm_sched_resubmit_jobs.
>>
>> Andrey
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>
>>>> This is why I asked for a trace with the timer enabled, but since there is 
>>>> a finite number of places where we touch the timer, Emily can just put 
>>>> prints there. Also, I wonder whether this temp fix helps her with the 
>>>> issue or not.
>>>>
>>>> Andrey
>>>>
>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>> The question is where do we rearm the timer for this problem to 
>>>>> occur?
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>
>>>>>> I was able to reproduce the crash by using the attached 
>>>>>> simulate_crash.patch - waiting on the guilty job to signal in the reset 
>>>>>> work and artificially rearming the timeout timer just before the 
>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in 
>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I 
>>>>>> think confirms the theory I described earlier in this thread.
>>>>>>
>>>>>> basic_fix.patch handles this by testing whether another timer is 
>>>>>> already armed on this scheduler or a timeout work is in 
>>>>>> execution right now (see the documentation for work_busy) - 
>>>>>> obviously this is not a full solution, as it will not protect 
>>>>>> from races if, for example, there is immediate work scheduling such 
>>>>>> as in drm_sched_fault - so we probably need to account for this 
>>>>>> by making drm_sched_cleanup_jobs (at least the part where it 
>>>>>> iterates the ring mirror list and frees jobs) and GPU reset really 
>>>>>> mutually exclusive, not like now.
>>>>>>
>>>>>> Andrey
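>>>>>>
>>>>>> To spell out the idea (the actual basic_fix.patch may differ and is not
>>>>>> quoted here), something along these lines at the top of
>>>>>> drm_sched_cleanup_jobs:
>>>>>>
>>>>>> 	/* Illustration only: bail out if a timeout work item is pending
>>>>>> 	 * or currently executing on this scheduler. */
>>>>>> 	if (work_busy(&sched->work_tdr.work))
>>>>>> 		return;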
>>>>>>
>>>>>>
>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>> Hi Emily,
>>>>>>>
>>>>>>> you need to print which scheduler instance is freeing the jobs 
>>>>>>> and which one is triggering the reset. The TID and PID are 
>>>>>>> completely meaningless here since we are called from different 
>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>
>>>>>>> Apart from that I will look into this a bit deeper when I have 
>>>>>>> time.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>> Hi Christian,
>>>>>>>>     I added the following print in drm_sched_cleanup_jobs. 
>>>>>>>> The log shows that using only cancel_delayed_work cannot 
>>>>>>>> avoid freeing the job while the sched is in reset. But I don't 
>>>>>>>> know exactly where the driver goes wrong. Do you have 
>>>>>>>> any suggestion about this?
>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, 
>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>         /*
>>>>>>>>          * Don't destroy jobs while the timeout worker is 
>>>>>>>> running  OR thread
>>>>>>>>          * is being parked and hence assumed to not touch 
>>>>>>>> ring_mirror_list
>>>>>>>>          */
>>>>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>                 return;
>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", 
>>>>>>>> current->tgid, current->pid);
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* 
>>>>>>>> Process information: process pid 0 thread pid 0, 
>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253, 
>>>>>>>> pid:2253
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel: 
>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262, 
>>>>>>>> pid:2262
>>>>>>>> >-----Original Message-----
>>>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >
>>>>>>>> >Thinking more about this claim - we assume here that if
>>>>>>>> >cancel_delayed_work returned true it guarantees that the timeout work
>>>>>>>> >is not running, but it merely means there was a pending timeout work
>>>>>>>> >which was removed from the workqueue before its timer elapsed, and so
>>>>>>>> >it didn't have a chance to be dequeued and executed; it doesn't cover
>>>>>>>> >already executing work. So there is a possibility that, while a timeout
>>>>>>>> >work started executing, another timeout work already got enqueued
>>>>>>>> >(maybe through earlier cleanup jobs or through drm_sched_fault), and if
>>>>>>>> >at this point another drm_sched_cleanup_jobs runs,
>>>>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even while there
>>>>>>>> >is a timeout job in progress.
>>>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>>>> >cancel_delayed_work_sync to flush the timeout work, as the timeout work
>>>>>>>> >itself waits for the scheduler thread to be parked again when calling
>>>>>>>> >park_thread.
>>>>>>>> >
>>>>>>>> >Andrey
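>>>>>>>> >
>>>>>>>> >To put the same point next to the check itself (sketch only; the gate
>>>>>>>> >as it currently sits in drm_sched_cleanup_jobs):
>>>>>>>> >
>>>>>>>> >	/* cancel_delayed_work() returning true only means a pending,
>>>>>>>> >	 * not-yet-running work item was cancelled; it says nothing about
>>>>>>>> >	 * a handler that is already executing, so cleanup can proceed
>>>>>>>> >	 * past this check while drm_sched_job_timedout() is still running.
>>>>>>>> >	 */
>>>>>>>> >	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>> >	    !cancel_delayed_work(&sched->work_tdr))
>>>>>>>> >		return;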
>>>>>>>> >
>>>>>>>> >________________________________________
>>>>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on 
>>>>>>>> behalf of
>>>>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >
>>>>>>>> >Hi Emily,
>>>>>>>> >
>>>>>>>> >exactly that can't happen. See here:
>>>>>>>> >
>>>>>>>> >>         /* Don't destroy jobs while the timeout worker is 
>>>>>>>> running */
>>>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>> >> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>> >>                 return NULL;
>>>>>>>> >
>>>>>>>> >We never free jobs while the timeout working is running to 
>>>>>>>> prevent exactly
>>>>>>>> >that issue.
>>>>>>>> >
>>>>>>>> >Regards,
>>>>>>>> >Christian.
>>>>>>>> >
>>>>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>> >> Hi Christian,
>>>>>>>> >>       The drm_sched_job_timedout -> amdgpu_job_timedout path calls
>>>>>>>> >> amdgpu_device_gpu_recover. I mean the main scheduler frees the jobs
>>>>>>>> >> while we are in amdgpu_device_gpu_recover, and before drm_sched_stop
>>>>>>>> >> is called.
>>>>>>>> >>
>>>>>>>> >> Best wishes
>>>>>>>> >> Emily Deng
>>>>>>>> >>
>>>>>>>> >>
>>>>>>>> >>
>>>>>>>> >>> -----Original Message-----
>>>>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>; 
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue 
>>>>>>>> for tdr
>>>>>>>> >>>
>>>>>>>> >>> Hi Emily,
>>>>>>>> >>>
>>>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>> >>>
>>>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job 
>>>>>>>> in the first place.
>>>>>>>> >>>
>>>>>>>> >>> Regards,
>>>>>>>> >>> Christian.
>>>>>>>> >>>
>>>>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>> >>>> Hi Christian,
>>>>>>>> >>>>        No, I am on the new branch and it also has the patch. Even if
>>>>>>>> >>>> the jobs are freed by the main scheduler, how can we avoid the main
>>>>>>>> >>>> scheduler freeing jobs while we are in amdgpu_device_gpu_recover?
>>>>>>>> >>>> Best wishes
>>>>>>>> >>>> Emily Deng
>>>>>>>> >>>>
>>>>>>>> >>>>
>>>>>>>> >>>>
>>>>>>>> >>>>> -----Original Message-----
>>>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >gfx@lists.freedesktop.org
>>>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for tdr
>>>>>>>> >>>>>
>>>>>>>> >>>>> Hi Emily,
>>>>>>>> >>>>>
>>>>>>>> >>>>> in this case you are on an old code branch.
>>>>>>>> >>>>>
>>>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only 
>>>>>>>> if no
>>>>>>>> >>>>> timeout handler is running.
>>>>>>>> >>>>>
>>>>>>>> >>>>> See this patch here:
>>>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>> >>>>>>
>>>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>>>> >>>>> Regards,
>>>>>>>> >>>>> Christian.
>>>>>>>> >>>>>
>>>>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>> >>>>>> Hi Christian,
>>>>>>>> >>>>>>         Please refer to the following log. When it enters the
>>>>>>>> >>>>>> amdgpu_device_gpu_recover function, the bad job 000000005086879e
>>>>>>>> >>>>>> is being freed in amdgpu_job_free_cb at the same time, because of
>>>>>>>> >>>>>> the hardware fence signal. But amdgpu_device_gpu_recover goes
>>>>>>>> >>>>>> faster; in this case the s_fence is already freed, but the job is
>>>>>>>> >>>>>> not freed in time. Then this issue occurs.
>>>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]] 
>>>>>>>> *ERROR* ring
>>>>>>>> >>> sdma0
>>>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [  449.793202]
>>>>>>>> >>>>>> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process 
>>>>>>>> information:
>>>>>>>> >>>>> process  pid 0 thread pid 0, s_job:000000005086879e [  
>>>>>>>> 449.794163]
>>>>>>>> >>>>> amdgpu
>>>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process 
>>>>>>>> information:
>>>>>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [ 
>>>>>>>> 449.794221]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process  
>>>>>>>> pid 0
>>>>>>>> >>>>>> thread pid 0,
>>>>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>> >>>>> information: process pid 0 thread  pid 0, 
>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer 
>>>>>>>> dereference
>>>>>>>> >>>>> at
>>>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0 [  449.801040] 
>>>>>>>> Oops:
>>>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: 
>>>>>>>> G OE
>>>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX + 
>>>>>>>> PIIX,
>>>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [  449.802944]
>>>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched] [
>>>>>>>> >>>>>> 449.803488]
>>>>>>>> >>> RIP:
>>>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 
>>>>>>>> 56 ff ff
>>>>>>>> >>>>>> ff
>>>>>>>> >>>>>> 45 85 e4 0f
>>>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 
>>>>>>>> 8b 40 10
>>>>>>>> >>>>> <48> 8b
>>>>>>>> >>> 98
>>>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 
>>>>>>>> 48 a8 01
>>>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 
>>>>>>>> 00010286 [
>>>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 
>>>>>>>> RCX:
>>>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI:
>>>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224] RBP:
>>>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 
>>>>>>>> 0000000000000000 [
>>>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11: 0000000000000148 
>>>>>>>> R12:
>>>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0 R14:
>>>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000 
>>>>>>>> ES: 0000
>>>>>>>> >CR0:
>>>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0 CR3:
>>>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747] DR0:
>>>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2: 
>>>>>>>> 0000000000000000 [
>>>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>>>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140 [amdgpu] [
>>>>>>>> >>>>>> 449.812635] drm_sched_job_timedout+0x44/0x90 [amd_sched] [
>>>>>>>> >>>>>> 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 
>>>>>>>> [amdgpu] [
>>>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90 
>>>>>>>> [amd_sched] [
>>>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>> >>>>>> worker_thread+0x34/0x410 [ 449.814728] 
>>>>>>>> kthread+0x121/0x140 [
>>>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [  
>>>>>>>> 449.815374]  ?
>>>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>> >>>>>>
>>>>>>>> >>>>>>> -----Original Message-----
>>>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for
>>>>>>>> >>>>>>> tdr
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> I can't follow how it would be possible for 
>>>>>>>> job->s_fence to be
>>>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> So it looks like this patch is just papering over some 
>>>>>>>> bigger issues.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>> Regards,
>>>>>>>> >>>>>>> Christian.
>>>>>>>> >>>>>>>
>>>>>>>> >>>>>>>> Best wishes
>>>>>>>> >>>>>>>> Emily Deng
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>
>>>>>>>> >>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>> >>>>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>> issue for
>>>>>>>> >>>>>>>>> tdr
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>> >>>>>>>>>> Ping.....
>>>>>>>> >>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>> Best wishes
>>>>>>>> >>>>>>>>>> Emily Deng
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>>>> From: amd-gfx 
>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>> >>> Behalf
>>>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>>>>>>>> >>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>> pointer issue
>>>>>>>> >>>>>>>>>>> for tdr
>>>>>>>> >>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>>>> >>>>>>>>>>>> From: Christian König 
>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>> pointer issue
>>>>>>>> >>>>>>>>>>>> for tdr
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is 
>>>>>>>> freed.
>>>>>>>> >>>>>>>>>>>>> Then it will has null pointer in 
>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job 
>>>>>>>> is destroyed.
>>>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. 
>>>>>>>> But in one
>>>>>>>> >>>>>>>>>>> case, when it enter into the 
>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time, 
>>>>>>>> it will
>>>>>>>> >>>>>>>>>>> go to free
>>>>>>>> >>>>> job.
>>>>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is 
>>>>>>>> faster. At
>>>>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already 
>>>>>>>> NULL.
>>>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>> >>>>>>>>>>
>>>>>>>> >>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>> >>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>> >>>>>>>>>> kfree(job);
>>>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the 
>>>>>>>> reference
>>>>>>>> >>>>>>>>> to the
>>>>>>>> >>>>> s_fence.
>>>>>>>> >>>>>>>>> So you are just papering over a much bigger problem 
>>>>>>>> here. This
>>>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>> Regards,
>>>>>>>> >>>>>>>>> Christian.
>>>>>>>> >>>>>>>>>
>>>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that 
>>>>>>>> means the
>>>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>> Regards,
>>>>>>>> >>>>>>>>>>>> Christian.
>>>>>>>> >>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>> >>>>>>>>>>>>> ---
>>>>>>>> >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>> >>>>>>>>>>>>> drivers/gpu/drm/scheduler/sched_main.c     | 11 
>>>>>>>> ++++++---
>>>>>>>> >--
>>>>>>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6 
>>>>>>>> deletions(-)
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> diff --git 
>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>> >>> amdgpu_device_gpu_recover(struct
>>>>>>>> >>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>> >>>>>>>>>>>>>            *
>>>>>>>> >>>>>>>>>>>>>            * job->base holds a reference to 
>>>>>>>> parent fence
>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>> >>>>>>> &&
>>>>>>>> >>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>> >>>>>>>>>>>>> job_signaled = true;
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>> >drm_sched_increase_karma(struct
>>>>>>>> >>>>>>>>>>>> drm_sched_job
>>>>>>>> >>>>>>>>>>>>> *bad)
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe(entity,
>>>>>>>> >>>>>>>>>>>>> tmp,
>>>>>>>> >>> &rq-
>>>>>>>> >>>>>>>> entities,
>>>>>>>> >>>>>>>>>>>> list) {
>>>>>>>> >>>>>>>>>>>>> -                          if 
>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>> >>>>>>> ==
>>>>>>>> >>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>> >>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>> >>>>>>>>>>>>> scheduled.context ==
>>>>>>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>> >>>>>>>>>>>>> if
>>>>>>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>> >>>>>>>> karma) >
>>>>>>>> >>>>>>>>>>>>> bad->sched-
>>>>>>>> >>>> hang_limit)
>>>>>>>> >>>>>>>>>>>>> if
>>>>>>>> >>>>>>>>>>>>> (entity-
>>>>>>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>> >>>>>>> drm_gpu_scheduler
>>>>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>>            * This iteration is thread safe as 
>>>>>>>> sched thread
>>>>>>>> >>>>>>>>>>>>> is
>>>>>>>> >>> stopped.
>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> +          if (s_job->s_fence && 
>>>>>>>> s_job->s_fence->parent &&
>>>>>>>> >>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>> >>>> s_fence-
>>>>>>>> >>>>>>>> parent,
>>>>>>>> >>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>> >>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>> >>> @@ -
>>>>>>>> >>>>>>> 395,7
>>>>>>>> >>>>>>>>>>> +395,8 @@ void
>>>>>>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>> >>>>>>>>>>>>> *
>>>>>>>> >>>>>>>>>>>>> * Job is still alive so fence
>>>>>>>> >>>>>>>>>>>>> refcount at
>>>>>>>> >>> least 1
>>>>>>>> >>>>>>>>>>>>> */
>>>>>>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>> >>>>>>> false);
>>>>>>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>>>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>> >>>>>>>> finished,
>>>>>>>> >>>>>>>>>>>> false);
>>>>>>>> >>>>>>>>>>>>> /*
>>>>>>>> >>>>>>>>>>>>> * We must keep bad job alive
>>>>>>>> >>>>>>>>>>>>> for later
>>>>>>>> >>> use
>>>>>>>> >>>>>>> during @@
>>>>>>>> >>>>>>>>>>>> -438,7
>>>>>>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct 
>>>>>>>> drm_gpu_scheduler
>>>>>>>> >>>>> *sched,
>>>>>>>> >>>>>>>>>>>>> +bool
>>>>>>>> >>>>>>>>>>>> full_recovery)
>>>>>>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>> >>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>> >>>>>>>>>>>>> node)
>>>>>>>> >>>>>>>>>>>> {
>>>>>>>> >>>>>>>>>>>>> -          struct dma_fence *fence = 
>>>>>>>> s_job->s_fence->parent;
>>>>>>>> >>>>>>>>>>>>> +          struct dma_fence *fence = 
>>>>>>>> s_job->s_fence ?
>>>>>>>> >>>>>>>>>>>>> + s_job-
>>>>>>>> >>>>>>>> s_fence-
>>>>>>>> >>>>>>>>>>>>> parent :
>>>>>>>> >>>>>>>>>>>>> +NULL;
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>> >>>>>>>>>>>>>
>>>>>>>> >>>>>>>>>>> _______________________________________________
>>>>>>>> >>>>>>>>>>> amd-gfx mailing list
>>>>>>>> >>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> >>>>>>>>>>> 
>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
>>>>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>>>>> >
>>>>>>>> >_______________________________________________
>>>>>>>> >amd-gfx mailing list
>>>>>>>> >amd-gfx@lists.freedesktop.org
>>>>>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>
>>>>>>
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>
>>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

[-- Attachment #2: 0001-drm-scheduler-Avoid-accessing-freed-bad-job.patch --]
[-- Type: text/x-patch, Size: 3224 bytes --]

From 8471644911cc7e300b08874ebb482d4e8c599904 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Thu, 14 Nov 2019 16:04:49 -0500
Subject: drm/scheduler: Avoid accessing freed bad job.

Problem:
Due to a race between drm_sched_cleanup_jobs in the sched thread and
drm_sched_job_timedout in the timeout work there is a possibility that
the bad job was already freed while still being accessed from the
timeout thread.

Fix:
Instead of just peeking at the bad job in the mirror list,
remove it from the list under lock and then put it back later, when
we are guaranteed that no race with the main sched thread is possible,
which is after the thread is parked.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 80ddbdf..c2a6108 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -287,10 +287,24 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	unsigned long flags;
 
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
+
+	/*
+	 * Protects against concurrent deletion in drm_sched_cleanup_jobs that
+	 * is already in progress.
+	 */
+	spin_lock_irqsave(&sched->job_list_lock, flags);
 	job = list_first_entry_or_null(&sched->ring_mirror_list,
 				       struct drm_sched_job, node);
 
 	if (job) {
+		/*
+		 * Remove the bad job so it cannot be freed by an already in progress
+		 * drm_sched_cleanup_jobs. It will be reinserted after sched->thread
+		 * is parked at which point it's safe.
+		 */
+		list_del_init(&job->node);
+		spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
 		job->sched->ops->timedout_job(job);
 
 		/*
@@ -302,6 +316,8 @@ static void drm_sched_job_timedout(struct work_struct *work)
 			sched->free_guilty = false;
 		}
 	}
+	else
+		spin_unlock_irqrestore(&sched->job_list_lock, flags);
 
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	drm_sched_start_timeout(sched);
@@ -373,6 +389,19 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 	kthread_park(sched->thread);
 
 	/*
+	 * Reinsert back the bad job here - now it's safe as drm_sched_cleanup_jobs
+	 * cannot race against us and release the bad job at this point - we parked
+	 * (waited for) any in progress (earlier) cleanups and any later ones will
+	 * bail out due to sched->thread being parked.
+	 */
+	if (bad && bad->sched == sched)
+		/*
+		 * Add at the head of the queue to reflect it was the earliest
+		 * job extracted.
+		 */
+		list_add(&bad->node, &sched->ring_mirror_list);
+
+	/*
 	 * Iterate the job list from later to  earlier one and either deactive
 	 * their HW callbacks or remove them from mirror list if they already
 	 * signaled.
@@ -657,7 +686,7 @@ static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
 		return;
 
 
-	while (!list_empty(&sched->ring_mirror_list)) {
+	while (!list_empty_careful(&sched->ring_mirror_list)) {
 		struct drm_sched_job *job;
 
 		job = list_first_entry(&sched->ring_mirror_list,
-- 
2.7.4


[-- Attachment #3: Type: text/plain, Size: 153 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-15  4:39                                                                                             ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-15  4:39 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Andrey,
     Currently I am busy with another issue; maybe I will try it next week.

Best wishes
Emily Deng



>-----Original Message-----
>From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>Sent: Friday, November 15, 2019 6:14 AM
>To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Attached.
>
>Emily - can you give it a try ?
>
>Andrey
>
>On 11/14/19 3:12 AM, Christian König wrote:
>>> What about instead of peeking at the job to actually remove it from
>>> ring_mirror_list right there,
>> Also an interesting idea. We would need to protect the mirror list
>> with a lock again, but that should be the lesser evil.
>>
>> Maybe prototype that and see if it works or not.
>>
>> Regards,
>> Christian.
>>
>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>
>>>
>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>> Another more fundamental question: Could we get rid of the timeout
>>>> job at all?
>>>
>>>
>>> There is other stuff there besides picking the first unfinished job
>>> which is common for all the drivers - such as freeing the guilty signaled
>>> job and rearming the timeout work timer.
>>>
>>>
>>>>
>>>> I mean we used to give this as parameter to the scheduler callback
>>>> because we had the timeout worker in the job, but that is no longer
>>>> the case.
>>>>
>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>                                        struct drm_sched_job, node);
>>>>
>>>> Why don't we just remove that here and only get the first job after
>>>> we have stopped the scheduler?
>>>
>>>
>>> Should be ok, since we have the extra check for __kthread_should_park
>>> in drm_sched_cleanup_jobs, which will protect us in this case from a
>>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>>> after we have already parked it. The problem here is that we need the
>>> drm_sched_job to access the private data of each client driver (see
>>> amdgpu_job_timedout for example). What about instead of just peeking at
>>> the job, actually remove it from ring_mirror_list right there and go
>>> ahead with it through the reset routine; if it's signaled in the
>>> meanwhile, great - release it, otherwise put it back into
>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>
>>> Andrey
>>>
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>
>>>>> This is why I asked for a trace with the timer enabled, but since there is
>>>>> a finite number of places where we touch the timer, Emily can just put
>>>>> prints there. Also, I wonder whether this temp fix helps her with the
>>>>> issue or not.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>> The question is where do we rearm the timer for this problem to
>>>>>> occur?
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>
>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>> simulate_crash.patch - waiting on the guilty job to signal in the reset
>>>>>>> work and artificially rearming the timeout timer just before the
>>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>>> think confirms the theory I described earlier in this thread.
>>>>>>>
>>>>>>> basic_fix.patch handles this by testing whether another timer is
>>>>>>> already armed on this scheduler or a timeout work is in
>>>>>>> execution right now (see the documentation for work_busy) - obviously
>>>>>>> this is not a full solution, as it will not protect from races
>>>>>>> if, for example, there is immediate work scheduling such as in
>>>>>>> drm_sched_fault - so we probably need to account for this by
>>>>>>> making drm_sched_cleanup_jobs (at least the part where it
>>>>>>> iterates the ring mirror list and frees jobs) and GPU reset really
>>>>>>> mutually exclusive, not like now.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>> Hi Emily,
>>>>>>>>
>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>> and which one is triggering the reset. The TID and PID are
>>>>>>>> completely meaningless here since we are called from different
>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>
>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>> time.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>> Hi Christian,
>>>>>>>>>     I added the following print in drm_sched_cleanup_jobs.
>>>>>>>>> The log shows that using only cancel_delayed_work cannot
>>>>>>>>> avoid freeing the job while the sched is in reset. But I don't
>>>>>>>>> know exactly where the driver goes wrong. Do you have
>>>>>>>>> any suggestion about this?
>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>         /*
>>>>>>>>>          * Don't destroy jobs while the timeout worker is
>>>>>>>>> running  OR thread
>>>>>>>>>          * is being parked and hence assumed to not touch
>>>>>>>>> ring_mirror_list
>>>>>>>>>          */
>>>>>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>                 return;
>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>> current->tgid, current->pid);
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>ring
>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>> pid:2253
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> >-----Original Message-----
>>>>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>> for tdr
>>>>>>>>> >
>>>>>>>>> >Thinking more about this claim - we assume here that if
>>>>>>>>> >cancel_delayed_work returned true it guarantees that the timeout work
>>>>>>>>> >is not running, but it merely means there was a pending timeout work
>>>>>>>>> >which was removed from the workqueue before its timer elapsed, and so
>>>>>>>>> >it didn't have a chance to be dequeued and executed; it doesn't cover
>>>>>>>>> >already executing work. So there is a possibility that, while a
>>>>>>>>> >timeout work started executing, another timeout work already got
>>>>>>>>> >enqueued (maybe through earlier cleanup jobs or through
>>>>>>>>> >drm_sched_fault), and if at this point another drm_sched_cleanup_jobs
>>>>>>>>> >runs, cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>> >while there is a timeout job in progress.
>>>>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>> >cancel_delayed_work_sync to flush the timeout work, as the timeout
>>>>>>>>> >work itself waits for the scheduler thread to be parked again when
>>>>>>>>> >calling park_thread.
>>>>>>>>> >
>>>>>>>>> >Andrey
>>>>>>>>> >
>>>>>>>>> >________________________________________
>>>>>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>> behalf of
>>>>>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>> for tdr
>>>>>>>>> >
>>>>>>>>> >Hi Emily,
>>>>>>>>> >
>>>>>>>>> >exactly that can't happen. See here:
>>>>>>>>> >
>>>>>>>>> >>         /* Don't destroy jobs while the timeout worker is
>>>>>>>>> running */
>>>>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>> >> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>> >>                 return NULL;
>>>>>>>>> >
>>>>>>>>> >We never free jobs while the timeout working is running to
>>>>>>>>> prevent exactly
>>>>>>>>> >that issue.
>>>>>>>>> >
>>>>>>>>> >Regards,
>>>>>>>>> >Christian.
>>>>>>>>> >
>>>>>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>> >> Hi Christian,
>>>>>>>>> >>       The drm_sched_job_timedout -> amdgpu_job_timedout path calls
>>>>>>>>> >> amdgpu_device_gpu_recover. I mean the main scheduler frees the jobs
>>>>>>>>> >> while we are in amdgpu_device_gpu_recover, and before
>>>>>>>>> >> drm_sched_stop is called.
>>>>>>>>> >>
>>>>>>>>> >> Best wishes
>>>>>>>>> >> Emily Deng
>>>>>>>>> >>
>>>>>>>>> >>
>>>>>>>>> >>
>>>>>>>>> >>> -----Original Message-----
>>>>>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>> for tdr
>>>>>>>>> >>>
>>>>>>>>> >>> Hi Emily,
>>>>>>>>> >>>
>>>>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>>> >>>
>>>>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>> in the first place.
>>>>>>>>> >>>
>>>>>>>>> >>> Regards,
>>>>>>>>> >>> Christian.
>>>>>>>>> >>>
>>>>>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>> >>>> Hi Christian,
>>>>>>>>> >>>>        No, I am on the new branch and it also has the patch. Even
>>>>>>>>> >>>> if the jobs are freed by the main scheduler, how can we avoid the
>>>>>>>>> >>>> main scheduler freeing jobs while we are in
>>>>>>>>> >>>> amdgpu_device_gpu_recover?
>>>>>>>>> >>>> Best wishes
>>>>>>>>> >>>> Emily Deng
>>>>>>>>> >>>>
>>>>>>>>> >>>>
>>>>>>>>> >>>>
>>>>>>>>> >>>>> -----Original Message-----
>>>>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>> >gfx@lists.freedesktop.org
>>>>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>> issue for tdr
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> Hi Emily,
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> in this case you are on an old code branch.
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>> if no
>>>>>>>>> >>>>> timeout handler is running.
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> See this patch here:
>>>>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>> >>>>>>
>>>>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>>>>> >>>>> Regards,
>>>>>>>>> >>>>> Christian.
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>> >>>>>> Hi Christian,
>>>>>>>>> >>>>>>         Please refer to the following log. When it enters the
>>>>>>>>> >>>>>> amdgpu_device_gpu_recover function, the bad job
>>>>>>>>> >>>>>> 000000005086879e is being freed in amdgpu_job_free_cb at the
>>>>>>>>> >>>>>> same time, because of the hardware fence signal. But
>>>>>>>>> >>>>>> amdgpu_device_gpu_recover goes faster; in this case the s_fence
>>>>>>>>> >>>>>> is already freed, but the job is not freed in time. Then this
>>>>>>>>> >>>>>> issue occurs.
>>>>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>> *ERROR* ring
>>>>>>>>> >>> sdma0
>>>>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>> >>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>*ERROR*
>>>>>>>>> >>>>>> Process
>>>>>>>>> information:
>>>>>>>>> >>>>> process  pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>> 449.794163]
>>>>>>>>> >>>>> amdgpu
>>>>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>> information:
>>>>>>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>> 449.794221]
>>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>> pid 0
>>>>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>> pid 0
>>>>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>> pid 0
>>>>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>> pid 0
>>>>>>>>> >>>>>> thread pid 0,
>>>>>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>> >>>>> information: process pid 0 thread  pid 0,
>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>> dereference
>>>>>>>>> >>>>> at
>>>>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>[  449.801040]
>>>>>>>>> Oops:
>>>>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>> G OE
>>>>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX
>+
>>>>>>>>> PIIX,
>>>>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>> >>>>>> 449.802944]
>>>>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>[
>>>>>>>>> >>>>>> 449.803488]
>>>>>>>>> >>> RIP:
>>>>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>> 56 ff ff
>>>>>>>>> >>>>>> ff
>>>>>>>>> >>>>>> 45 85 e4 0f
>>>>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>> 8b 40 10
>>>>>>>>> >>>>> <48> 8b
>>>>>>>>> >>> 98
>>>>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>> 48 a8 01
>>>>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>> 00010286 [
>>>>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX:
>0000000000000000
>>>>>>>>> RCX:
>>>>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>RSI:
>>>>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224]
>RBP:
>>>>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>> 0000000000000000 [
>>>>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11:
>0000000000000148
>>>>>>>>> R12:
>>>>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>R14:
>>>>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>> ES: 0000
>>>>>>>>> >CR0:
>>>>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>CR3:
>>>>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747]
>DR0:
>>>>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>> 0000000000000000 [
>>>>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6:
>00000000fffe0ff0 DR7:
>>>>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140
>[amdgpu]
>>>>>>>>> >>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>> >>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>> >>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>> [amdgpu] [
>>>>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>> [amd_sched] [
>>>>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>>> >>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>> 449.815374]  ?
>>>>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>> >>>>>>
>>>>>>>>> >>>>>>> -----Original Message-----
>>>>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>> >>> gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>> issue for
>>>>>>>>> >>>>>>> tdr
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>> I can't follow how it would be possible for
>>>>>>>>> job->s_fence to be
>>>>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>> So it looks like this patch is just papering over some
>>>>>>>>> bigger issues.
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>> Regards,
>>>>>>>>> >>>>>>> Christian.
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>>> Best wishes
>>>>>>>>> >>>>>>>> Emily Deng
>>>>>>>>> >>>>>>>>
>>>>>>>>> >>>>>>>>
>>>>>>>>> >>>>>>>>
>>>>>>>>> >>>>>>>>> -----Original Message-----
>>>>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>> >>>>> gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>> issue for
>>>>>>>>> >>>>>>>>> tdr
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>> >>>>>>>>>> Ping.....
>>>>>>>>> >>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>> >>>>>>>>> :)
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>>> Best wishes
>>>>>>>>> >>>>>>>>>> Emily Deng
>>>>>>>>> >>>>>>>>>>
>>>>>>>>> >>>>>>>>>>
>>>>>>>>> >>>>>>>>>>
>>>>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>>>>> >>>>>>>>>>> From: amd-gfx
>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>> >>> Behalf
>>>>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>> >>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>> pointer issue
>>>>>>>>> >>>>>>>>>>> for tdr
>>>>>>>>> >>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>>>>> >>>>>>>>>>>> From: Christian König
>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>> pointer issue
>>>>>>>>> >>>>>>>>>>>> for tdr
>>>>>>>>> >>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>> freed.
>>>>>>>>> >>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>> is destroyed.
>>>>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>> But in one
>>>>>>>>> >>>>>>>>>>> case, when it enter into the
>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time,
>>>>>>>>> it will
>>>>>>>>> >>>>>>>>>>> go to free
>>>>>>>>> >>>>> job.
>>>>>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>> faster. At
>>>>>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already
>>>>>>>>> NULL.
>>>>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>> >>>>>>>>>>
>>>>>>>>> >>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>> >>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>> >>>>>>>>>> kfree(job);
>>>>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>> reference
>>>>>>>>> >>>>>>>>> to the
>>>>>>>>> >>>>> s_fence.
>>>>>>>>> >>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>> here. This
>>>>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>> Regards,
>>>>>>>>> >>>>>>>>> Christian.
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>> means the
>>>>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>>>>> >>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>> Regards,
>>>>>>>>> >>>>>>>>>>>> Christian.
>>>>>>>>> >>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>> >>>>>>>>>>>>> ---
>>>>>>>>> >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>|  2
>>>>>>>>> >>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c     |
>>>>>>>>> >>>>>>>>>>>>> 11
>>>>>>>>> ++++++---
>>>>>>>>> >--
>>>>>>>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6
>>>>>>>>> deletions(-)
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> diff --git
>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> >>>>>>>>>>>>>
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>> >>>>>>>>>>>>> ---
>a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> >>>>>>>>>>>>> +++
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>> >>> amdgpu_device_gpu_recover(struct
>>>>>>>>> >>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>> >>>>>>>>>>>>>            *
>>>>>>>>> >>>>>>>>>>>>>            * job->base holds a reference to
>>>>>>>>> parent fence
>>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>> >>>>>>> &&
>>>>>>>>> >>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>parent))
>>>>>>>>> >>>>>>>>>>>>> job_signaled = true;
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> diff --git
>>>>>>>>> >>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>> >drm_sched_increase_karma(struct
>>>>>>>>> >>>>>>>>>>>> drm_sched_job
>>>>>>>>> >>>>>>>>>>>>> *bad)
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>> >>> &rq-
>>>>>>>>> >>>>>>>> entities,
>>>>>>>>> >>>>>>>>>>>> list) {
>>>>>>>>> >>>>>>>>>>>>> -                          if
>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>> >>>>>>> ==
>>>>>>>>> >>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>> >>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>> >>>>>>>>>>>>> scheduled.context ==
>>>>>>>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>> >>>>>>>>>>>>> if
>>>>>>>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>> >>>>>>>> karma) >
>>>>>>>>> >>>>>>>>>>>>> bad->sched-
>>>>>>>>> >>>> hang_limit)
>>>>>>>>> >>>>>>>>>>>>> if
>>>>>>>>> >>>>>>>>>>>>> (entity-
>>>>>>>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>> >>>>>>> drm_gpu_scheduler
>>>>>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>> >>>>>>>>>>>>>            * This iteration is thread safe as
>>>>>>>>> sched thread
>>>>>>>>> >>>>>>>>>>>>> is
>>>>>>>>> >>> stopped.
>>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>> >>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>> >>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>> >>>> s_fence-
>>>>>>>>> >>>>>>>> parent,
>>>>>>>>> >>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>> >>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>> >>> @@ -
>>>>>>>>> >>>>>>> 395,7
>>>>>>>>> >>>>>>>>>>> +395,8 @@ void
>>>>>>>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>> >>>>>>>>>>>>> *
>>>>>>>>> >>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>> >>> least 1
>>>>>>>>> >>>>>>>>>>>>> */
>>>>>>>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>> >>>>>>> false);
>>>>>>>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>>>>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>> >>>>>>>> finished,
>>>>>>>>> >>>>>>>>>>>> false);
>>>>>>>>> >>>>>>>>>>>>> /*
>>>>>>>>> >>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>> >>> use
>>>>>>>>> >>>>>>> during @@
>>>>>>>>> >>>>>>>>>>>> -438,7
>>>>>>>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>> drm_gpu_scheduler
>>>>>>>>> >>>>> *sched,
>>>>>>>>> >>>>>>>>>>>>> +bool
>>>>>>>>> >>>>>>>>>>>> full_recovery)
>>>>>>>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>> >>>>>>>>>>>>>&sched->ring_mirror_list,
>>>>>>>>> >>>>>>>>>>>>> node)
>>>>>>>>> >>>>>>>>>>>> {
>>>>>>>>> >>>>>>>>>>>>> -          struct dma_fence *fence =
>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>> >>>>>>>>>>>>> +          struct dma_fence *fence =
>>>>>>>>> s_job->s_fence ?
>>>>>>>>> >>>>>>>>>>>>> + s_job-
>>>>>>>>> >>>>>>>> s_fence-
>>>>>>>>> >>>>>>>>>>>>> parent :
>>>>>>>>> >>>>>>>>>>>>> +NULL;
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>
>_______________________________________________
>>>>>>>>> >>>>>>>>>>> amd-gfx mailing list amd-gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>>>>>>
>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>>>>>> >
>>>>>>>>> >_______________________________________________
>>>>>>>>> >amd-gfx mailing list
>>>>>>>>> >amd-gfx@lists.freedesktop.org
>>>>>>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>
>>>>>>>
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>
>>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-15  4:39                                                                                             ` Deng, Emily
  0 siblings, 0 replies; 80+ messages in thread
From: Deng, Emily @ 2019-11-15  4:39 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Koenig, Christian, amd-gfx

Hi Andrey,
     I am currently busy with another issue; I will probably try it next week.

Best wishes
Emily Deng



>-----Original Message-----
>From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>Sent: Friday, November 15, 2019 6:14 AM
>To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>
>Attached.
>
>Emily - can you give it a try ?
>
>Andrey
>
>On 11/14/19 3:12 AM, Christian König wrote:
>>> What about instead of peeking at the job to actually remove it from
>>> ring_mirror_list right there,
>> Also an interesting idea. We would need to protect the mirror list
>> with a lock again, but that should be the lesser evil.
>>
>> Maybe prototype that and see if it works or not.
>>
>> Regards,
>> Christian.
>>
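For illustration, a minimal sketch of the idea being discussed here - pulling the
bad job off ring_mirror_list under a lock inside the timeout handler, and only
putting it back later if it has not signaled. It assumes the scheduler still has a
job_list_lock protecting ring_mirror_list and the drm_sched_start_timeout()
helper; this is not the patch attached to the thread:

	/* Sketch only - not the attached patch. */
	static void drm_sched_job_timedout(struct work_struct *work)
	{
		struct drm_gpu_scheduler *sched =
			container_of(work, struct drm_gpu_scheduler, work_tdr.work);
		struct drm_sched_job *job;

		/* Take the first job off the mirror list; once it is off the
		 * list, the scheduler thread can no longer free it under us. */
		spin_lock(&sched->job_list_lock);
		job = list_first_entry_or_null(&sched->ring_mirror_list,
					       struct drm_sched_job, node);
		if (job)
			list_del_init(&job->node);
		spin_unlock(&sched->job_list_lock);

		if (job)
			job->sched->ops->timedout_job(job);

		/* Rearm the timer for the next job as before. */
		spin_lock(&sched->job_list_lock);
		drm_sched_start_timeout(sched);
		spin_unlock(&sched->job_list_lock);
	}

If the job has not signaled by the end of the reset, the recovery path (for
example drm_sched_resubmit_jobs()) would re-insert it into ring_mirror_list
under the same lock before the scheduler is restarted.
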
>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>
>>>
>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>> Another more fundamental question: Could we get rid of the timeout
>>>> job at all?
>>>
>>>
>>> There is other work there besides picking the first unfinished job that
>>> is common to all the drivers - such as freeing the guilty job once it has
>>> signaled and rearming the timeout work timer.
>>>
>>>
>>>>
>>>> I mean we used to give this as parameter to the scheduler callback
>>>> because we had the timeout worker in the job, but that is no longer
>>>> the case.
>>>>
>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>         job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>                                        struct drm_sched_job, node);
>>>>
>>>> Why don't we just remove that here and only get the first job after
>>>> we have stopped the scheduler?
>>>
>>>
>>> Should be ok, since we have the extra check for __kthread_should_park
>>> in drm_sched_cleanup_jobs, which will protect us in this case from a
>>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>>> after we have already parked it. The problem here is that we need the
>>> drm_sched_job to access the private data of each client driver (see
>>> amdgpu_job_timedout for example). What about, instead of peeking at
>>> the job, actually removing it from ring_mirror_list right there, going
>>> ahead with it through the reset routine, and if it has signaled in the
>>> meanwhile - great, release it; otherwise put it back into
>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>
>>> Andrey
>>>
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>
>>>>> This why I asked for a trace with timer enabled, but since there is
>>>>> a finite number of places we touch the timer Emily can just put
>>>>> prints there. Also, I wonder if this temp fix helps her with the
>>>>> issue or not.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>> The question is where do we rearm the timer for this problem to
>>>>>> occur?
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>
>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>> simulate_crash.patch - waiting on guilty job to signal in reset
>>>>>>> work and artificially rearming the timeout timer just before the
>>>>>>> check for !cancel_delayed_work(&sched->work_tdr)  in
>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>
>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>> already armed ob this scheduler or is there a timeout work in
>>>>>>> execution right now (see documentation for work_busy) - obviously
>>>>>>> this is not a full solution as this will not protect from races
>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>> drm_sched_fault -  so we probably need to account for this by
>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>> iterates ring mirror list and frees jobs) and GPU reset really
>>>>>>> mutually exclusive and not like now.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
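For illustration, a rough sketch of the work_busy() based guard that
basic_fix.patch is described as adding to drm_sched_cleanup_jobs(); the exact
placement is an assumption, and this is not the attached patch:

	/* Sketch only: skip freeing jobs while a TDR is pending or running. */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    (work_busy(&sched->work_tdr.work) &
	     (WORK_BUSY_PENDING | WORK_BUSY_RUNNING)))
		return;

	/* ... the existing loop that iterates ring_mirror_list and frees
	 * finished jobs would then run as before ... */

As noted above, work_busy() is inherently racy, so a check like this only
narrows the window; it does not make job cleanup and GPU reset mutually
exclusive.
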
>>>>>>>
>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>> Hi Emily,
>>>>>>>>
>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>> completely meaningless here since we are called from different
>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>
>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>> time.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
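As a sketch of that suggestion, the debug prints could carry the scheduler
instance (ring) name instead of the TID/PID. The print sites and wording below
are assumptions for illustration, not code from the thread:

	/* In the job free path (e.g. amdgpu_job_free_cb): which scheduler
	 * instance is freeing this job? */
	printk("Emily:free s_job:%p sched:%s\n", s_job, s_job->sched->name);

	/* In amdgpu_job_timedout: which scheduler instance triggered the
	 * reset? */
	printk("Emily:timedout s_job:%p sched:%s\n",
	       &job->base, job->base.sched->name);
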
>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>> Hi Christian,
>>>>>>>>>     I add the follow print in function drm_sched_cleanup_jobs.
>>>>>>>>> From the log it shows that only use cancel_delayed_work could
>>>>>>>>> not avoid to free job when the sched is in reset. But don’t
>>>>>>>>> know exactly where it is wrong about the driver. Do you have
>>>>>>>>> any suggestion about this?
>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>         /*
>>>>>>>>>          * Don't destroy jobs while the timeout worker is
>>>>>>>>> running  OR thread
>>>>>>>>>          * is being parked and hence assumed to not touch
>>>>>>>>> ring_mirror_list
>>>>>>>>>          */
>>>>>>>>>          if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>                 return;
>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>> current->tgid, current->pid);
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>ring
>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>> pid:2253
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262,
>>>>>>>>> pid:2262
>>>>>>>>> >-----Original Message-----
>>>>>>>>> >From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>> >Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>> >To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>>> ><Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>> for tdr
>>>>>>>>> >
>>>>>>>>> >Thinking more about this claim - we assume here that if
>>>>>>>>> cancel_delayed_work
>>>>>>>>> >returned true it guarantees that timeout work is not running
>>>>>>>>> but, it merely
>>>>>>>>> >means there was a pending timeout work which was removed
>from
>>>>>>>>> >the workqueue before it's timer elapsed and so it didn't have
>>>>>>>>> >a
>>>>>>>>> chance to be
>>>>>>>>> >dequeued and executed, it doesn't cover already executing
>>>>>>>>> work. So there is a
>>>>>>>>> >possibility where while timeout work started executing another
>>>>>>>>> timeout work
>>>>>>>>> >already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>>> through
>>>>>>>>> >drm_sched_fault) and if at this point another
>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>> >cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>> while there is a
>>>>>>>>> >timeout job in progress.
>>>>>>>>> >Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>> >cancel_delayed_work_sync to flush the timeout work as timeout
>>>>>>>>> work itself
>>>>>>>>> >waits for schedule thread  to be parked again when calling
>>>>>>>>> park_thread.
>>>>>>>>> >
>>>>>>>>> >Andrey
>>>>>>>>> >
>>>>>>>>> >________________________________________
>>>>>>>>> >From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>> behalf of
>>>>>>>>> >Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >Sent: 08 November 2019 05:35:18
>>>>>>>>> >To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>> >Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>> for tdr
>>>>>>>>> >
>>>>>>>>> >Hi Emily,
>>>>>>>>> >
>>>>>>>>> >exactly that can't happen. See here:
>>>>>>>>> >
>>>>>>>>> >>         /* Don't destroy jobs while the timeout worker is
>>>>>>>>> running */
>>>>>>>>> >>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>> >> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>> >>                 return NULL;
>>>>>>>>> >
>>>>>>>>> >We never free jobs while the timeout working is running to
>>>>>>>>> prevent exactly
>>>>>>>>> >that issue.
>>>>>>>>> >
>>>>>>>>> >Regards,
>>>>>>>>> >Christian.
>>>>>>>>> >
>>>>>>>>> >Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>> >> Hi Christian,
>>>>>>>>> >>       The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>>>>> >amdgpu_device_gpu_recover. I mean the main scheduler free the
>>>>>>>>> jobs while
>>>>>>>>> >in amdgpu_device_gpu_recover, and before calling
>drm_sched_stop.
>>>>>>>>> >>
>>>>>>>>> >> Best wishes
>>>>>>>>> >> Emily Deng
>>>>>>>>> >>
>>>>>>>>> >>
>>>>>>>>> >>
>>>>>>>>> >>> -----Original Message-----
>>>>>>>>> >>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>> >>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> >>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>> for tdr
>>>>>>>>> >>>
>>>>>>>>> >>> Hi Emily,
>>>>>>>>> >>>
>>>>>>>>> >>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>>> >>>
>>>>>>>>> >>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>> in the first place.
>>>>>>>>> >>>
>>>>>>>>> >>> Regards,
>>>>>>>>> >>> Christian.
>>>>>>>>> >>>
>>>>>>>>> >>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>> >>>> Hi Chrisitan,
>>>>>>>>> >>>>        No, I am with the new branch and also has the
>>>>>>>>> patch. Even it
>>>>>>>>> >>>> are freed by
>>>>>>>>> >>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>> jobs while
>>>>>>>>> >>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>> >>>> Best wishes
>>>>>>>>> >>>> Emily Deng
>>>>>>>>> >>>>
>>>>>>>>> >>>>
>>>>>>>>> >>>>
>>>>>>>>> >>>>> -----Original Message-----
>>>>>>>>> >>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>> >>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>> >gfx@lists.freedesktop.org
>>>>>>>>> >>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>> issue for tdr
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> Hi Emily,
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> in this case you are on an old code branch.
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>> if no
>>>>>>>>> >>>>> timeout handler is running.
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> See this patch here:
>>>>>>>>> >>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>> >>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>> >>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>> >>>>>>
>>>>>>>>> >>>>>>       drm/scheduler: rework job destruction
>>>>>>>>> >>>>> Regards,
>>>>>>>>> >>>>> Christian.
>>>>>>>>> >>>>>
>>>>>>>>> >>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>> >>>>>> Hi Christian,
>>>>>>>>> >>>>>>         Please refer to follow log, when it enter to
>>>>>>>>> >>>>>>amdgpu_device_gpu_recover
>>>>>>>>> >>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>> function
>>>>>>>>> >>>>> amdgpu_job_free_cb  at the same time, because of the
>>>>>>>>> hardware fence
>>>>>>>>> >>> signal.
>>>>>>>>> >>>>> But amdgpu_device_gpu_recover goes faster, at this case,
>>>>>>>>> >>>>> the s_fence is already freed, but job is not freed in time.
>>>>>>>>> Then this issue
>>>>>>>>> >occurs.
>>>>>>>>> >>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>> *ERROR* ring
>>>>>>>>> >>> sdma0
>>>>>>>>> >>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>> >>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>*ERROR*
>>>>>>>>> >>>>>> Process
>>>>>>>>> information:
>>>>>>>>> >>>>> process  pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>> 449.794163]
>>>>>>>>> >>>>> amdgpu
>>>>>>>>> >>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>> >>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>> information:
>>>>>>>>> >>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>> 449.794221]
>>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>> pid 0
>>>>>>>>> >>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>> pid 0
>>>>>>>>> >>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>> pid 0
>>>>>>>>> >>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>>> >>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>> pid 0
>>>>>>>>> >>>>>> thread pid 0,
>>>>>>>>> >>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>> >>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>> >>>>> information: process pid 0 thread  pid 0,
>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>> >>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>> dereference
>>>>>>>>> >>>>> at
>>>>>>>>> >>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>[  449.801040]
>>>>>>>>> Oops:
>>>>>>>>> >>>>> 0000 [#1] SMP PTI
>>>>>>>>> >>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>> G OE
>>>>>>>>> >>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>> >>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX
>+
>>>>>>>>> PIIX,
>>>>>>>>> >>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>> >>>>>> 449.802944]
>>>>>>>>> >>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>[
>>>>>>>>> >>>>>> 449.803488]
>>>>>>>>> >>> RIP:
>>>>>>>>> >>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>> >>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>> 56 ff ff
>>>>>>>>> >>>>>> ff
>>>>>>>>> >>>>>> 45 85 e4 0f
>>>>>>>>> >>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>> 8b 40 10
>>>>>>>>> >>>>> <48> 8b
>>>>>>>>> >>> 98
>>>>>>>>> >>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>> 48 a8 01
>>>>>>>>> >>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>> 00010286 [
>>>>>>>>> >>>>>> 449.806032] RAX: 0000000000000000 RBX:
>0000000000000000
>>>>>>>>> RCX:
>>>>>>>>> >>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>RSI:
>>>>>>>>> >>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224]
>RBP:
>>>>>>>>> >>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>> 0000000000000000 [
>>>>>>>>> >>>>>> 449.807818] R10: 0000000000000000 R11:
>0000000000000148
>>>>>>>>> R12:
>>>>>>>>> >>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>R14:
>>>>>>>>> >>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>>> >>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>> >>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>> ES: 0000
>>>>>>>>> >CR0:
>>>>>>>>> >>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>CR3:
>>>>>>>>> >>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747]
>DR0:
>>>>>>>>> >>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>> 0000000000000000 [
>>>>>>>>> >>>>> 449.811344] DR3: 0000000000000000 DR6:
>00000000fffe0ff0 DR7:
>>>>>>>>> >>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>> >>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140
>[amdgpu]
>>>>>>>>> >>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>> >>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>> >>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>> [amdgpu] [
>>>>>>>>> >>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>> [amd_sched] [
>>>>>>>>> >>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>>> >>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>> >>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>> 449.815374]  ?
>>>>>>>>> >>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>> >>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>> >>>>>>
>>>>>>>>> >>>>>>> -----Original Message-----
>>>>>>>>> >>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>> >>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>> >>> gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>> issue for
>>>>>>>>> >>>>>>> tdr
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>> >>>>>>>> Sorry, please take your time.
>>>>>>>>> >>>>>>> Have you seen my other response a bit below?
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>> I can't follow how it would be possible for
>>>>>>>>> job->s_fence to be
>>>>>>>>> >>>>>>> NULL without the job also being freed.
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>> So it looks like this patch is just papering over some
>>>>>>>>> bigger issues.
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>> Regards,
>>>>>>>>> >>>>>>> Christian.
>>>>>>>>> >>>>>>>
>>>>>>>>> >>>>>>>> Best wishes
>>>>>>>>> >>>>>>>> Emily Deng
>>>>>>>>> >>>>>>>>
>>>>>>>>> >>>>>>>>
>>>>>>>>> >>>>>>>>
>>>>>>>>> >>>>>>>>> -----Original Message-----
>>>>>>>>> >>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> >>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>> >>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>> >>>>> gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>> issue for
>>>>>>>>> >>>>>>>>> tdr
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>> >>>>>>>>>> Ping.....
>>>>>>>>> >>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>> >>>>>>>>> :)
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>>> Best wishes
>>>>>>>>> >>>>>>>>>> Emily Deng
>>>>>>>>> >>>>>>>>>>
>>>>>>>>> >>>>>>>>>>
>>>>>>>>> >>>>>>>>>>
>>>>>>>>> >>>>>>>>>>> -----Original Message-----
>>>>>>>>> >>>>>>>>>>> From: amd-gfx
>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>> >>> Behalf
>>>>>>>>> >>>>>>>>>>> Of Deng, Emily
>>>>>>>>> >>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>> >>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>> >>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>> pointer issue
>>>>>>>>> >>>>>>>>>>> for tdr
>>>>>>>>> >>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>> -----Original Message-----
>>>>>>>>> >>>>>>>>>>>> From: Christian König
>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>> >>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>> >>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>> >>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>> pointer issue
>>>>>>>>> >>>>>>>>>>>> for tdr
>>>>>>>>> >>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>> >>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>> freed.
>>>>>>>>> >>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>> >>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>> is destroyed.
>>>>>>>>> >>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>> >>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>> But in one
>>>>>>>>> >>>>>>>>>>> case, when it enter into the
>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>> >>>>>>>>>>> already in drm_sched_job_cleanup, and at this time,
>>>>>>>>> it will
>>>>>>>>> >>>>>>>>>>> go to free
>>>>>>>>> >>>>> job.
>>>>>>>>> >>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>> faster. At
>>>>>>>>> >>>>>>>>>>> that time, job is not freed, but s_fence is already
>>>>>>>>> NULL.
>>>>>>>>> >>>>>>>>> No, that case can't happen. See here:
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>> >>>>>>>>>>
>>>>>>>>> >>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>> >>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>> >>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>> >>>>>>>>>> kfree(job);
>>>>>>>>> >>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>> reference
>>>>>>>>> >>>>>>>>> to the
>>>>>>>>> >>>>> s_fence.
>>>>>>>>> >>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>> here. This
>>>>>>>>> >>>>>>>>> patch is a clear NAK.
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>> Regards,
>>>>>>>>> >>>>>>>>> Christian.
>>>>>>>>> >>>>>>>>>
>>>>>>>>> >>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>> means the
>>>>>>>>> >>>>>>>>>>>> problem is somewhere else.
>>>>>>>>> >>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>> Regards,
>>>>>>>>> >>>>>>>>>>>> Christian.
>>>>>>>>> >>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>> >>>>>>>>>>>>> ---
>>>>>>>>> >>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>|  2
>>>>>>>>> >>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c     |
>>>>>>>>> >>>>>>>>>>>>> 11
>>>>>>>>> ++++++---
>>>>>>>>> >--
>>>>>>>>> >>>>>>>>>>>>>       2 files changed, 7 insertions(+), 6
>>>>>>>>> deletions(-)
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> diff --git
>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> >>>>>>>>>>>>>
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> >>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>> >>>>>>>>>>>>> ---
>a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> >>>>>>>>>>>>> +++
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> >>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>> >>> amdgpu_device_gpu_recover(struct
>>>>>>>>> >>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>> >>>>>>>>>>>>>            *
>>>>>>>>> >>>>>>>>>>>>>            * job->base holds a reference to
>>>>>>>>> parent fence
>>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>>> >>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>> >>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>> >>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>> >>>>>>> &&
>>>>>>>>> >>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>parent))
>>>>>>>>> >>>>>>>>>>>>> job_signaled = true;
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> diff --git
>>>>>>>>> >>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> >>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> >>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>> >>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> >>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>> >>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>> >drm_sched_increase_karma(struct
>>>>>>>>> >>>>>>>>>>>> drm_sched_job
>>>>>>>>> >>>>>>>>>>>>> *bad)
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>> >>> &rq-
>>>>>>>>> >>>>>>>> entities,
>>>>>>>>> >>>>>>>>>>>> list) {
>>>>>>>>> >>>>>>>>>>>>> -                          if
>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>> >>>>>>> ==
>>>>>>>>> >>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>> >>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>> >>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>> >>>>>>>>>>>>> scheduled.context ==
>>>>>>>>> >>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>> >>>>>>>>>>>>> if
>>>>>>>>> >>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>> >>>>>>>> karma) >
>>>>>>>>> >>>>>>>>>>>>> bad->sched-
>>>>>>>>> >>>> hang_limit)
>>>>>>>>> >>>>>>>>>>>>> if
>>>>>>>>> >>>>>>>>>>>>> (entity-
>>>>>>>>> >>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>> >>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>> >>>>>>> drm_gpu_scheduler
>>>>>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>> >>>>>>>>>>>>>            * This iteration is thread safe as
>>>>>>>>> sched thread
>>>>>>>>> >>>>>>>>>>>>> is
>>>>>>>>> >>> stopped.
>>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>> >>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>> >>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>> >>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>> >>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>> >>>> s_fence-
>>>>>>>>> >>>>>>>> parent,
>>>>>>>>> >>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>> >>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>> >>> @@ -
>>>>>>>>> >>>>>>> 395,7
>>>>>>>>> >>>>>>>>>>> +395,8 @@ void
>>>>>>>>> >>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>> >>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>> >>>>>>>>>>>>> *
>>>>>>>>> >>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>> >>> least 1
>>>>>>>>> >>>>>>>>>>>>> */
>>>>>>>>> >>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>> >>>>>>> false);
>>>>>>>>> >>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>>>>>> >>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>> >>>>>>>> finished,
>>>>>>>>> >>>>>>>>>>>> false);
>>>>>>>>> >>>>>>>>>>>>> /*
>>>>>>>>> >>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>> >>> use
>>>>>>>>> >>>>>>> during @@
>>>>>>>>> >>>>>>>>>>>> -438,7
>>>>>>>>> >>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>> drm_gpu_scheduler
>>>>>>>>> >>>>> *sched,
>>>>>>>>> >>>>>>>>>>>>> +bool
>>>>>>>>> >>>>>>>>>>>> full_recovery)
>>>>>>>>> >>>>>>>>>>>>>            * GPU recovers can't run in parallel.
>>>>>>>>> >>>>>>>>>>>>>            */
>>>>>>>>> >>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>> >>>>>>>>>>>>>&sched->ring_mirror_list,
>>>>>>>>> >>>>>>>>>>>>> node)
>>>>>>>>> >>>>>>>>>>>> {
>>>>>>>>> >>>>>>>>>>>>> -          struct dma_fence *fence =
>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>> >>>>>>>>>>>>> +          struct dma_fence *fence =
>>>>>>>>> s_job->s_fence ?
>>>>>>>>> >>>>>>>>>>>>> + s_job-
>>>>>>>>> >>>>>>>> s_fence-
>>>>>>>>> >>>>>>>>>>>>> parent :
>>>>>>>>> >>>>>>>>>>>>> +NULL;
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>> >>>>>>>>>>>>>
>>>>>>>>> >>>>>>>>>>>
>_______________________________________________
>>>>>>>>> >>>>>>>>>>> amd-gfx mailing list amd-gfx@lists.freedesktop.org
>>>>>>>>> >>>>>>>>>>>
>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>>>>>> >
>>>>>>>>> >_______________________________________________
>>>>>>>>> >amd-gfx mailing list
>>>>>>>>> >amd-gfx@lists.freedesktop.org
>>>>>>>>> >https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>
>>>>>>>
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>
>>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 14:07                                                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-18 14:07 UTC (permalink / raw)
  To: Deng, Emily, Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Thanks Emily.

Christian - ping for review.

Andrey

On 11/14/19 11:39 PM, Deng, Emily wrote:
> Hi Andrey,
>       I am currently busy with another issue; I will probably try it next week.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Friday, November 15, 2019 6:14 AM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Attached.
>>
>> Emily - can you give it a try ?
>>
>> Andrey
>>
>> On 11/14/19 3:12 AM, Christian König wrote:
>>>> What about instead of peeking at the job to actually remove it from
>>>> ring_mirror_list right there,
>>> Also an interesting idea. We would need to protect the mirror list
>>> with a lock again, but that should be the lesser evil.
>>>
>>> Maybe prototype that and see if it works or not.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>
>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>> Another more fundamental question: Could we get rid of the timeout
>>>>> job at all?
>>>>
>>>> There is other work there besides picking the first unfinished job that
>>>> is common to all the drivers - such as freeing the guilty job once it has
>>>> signaled and rearming the timeout work timer.
>>>>
>>>>
>>>>> I mean we used to give this as parameter to the scheduler callback
>>>>> because we had the timeout worker in the job, but that is no longer
>>>>> the case.
>>>>>
>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>          job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>                                         struct drm_sched_job, node);
>>>>> Why don't we just remove that here and only get the first job after
>>>>> we have stopped the scheduler?
>>>>
>>>> Should be ok, since we have the extra check for __kthread_should_park
>>>> in drm_sched_cleanup_jobs, which will protect us in this case from a
>>>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>>>> after we have already parked it. The problem here is that we need the
>>>> drm_sched_job to access the private data of each client driver (see
>>>> amdgpu_job_timedout for example). What about, instead of peeking at
>>>> the job, actually removing it from ring_mirror_list right there, going
>>>> ahead with it through the reset routine, and if it has signaled in the
>>>> meanwhile - great, release it; otherwise put it back into
>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>
>>>> Andrey
>>>>
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>> This why I asked for a trace with timer enabled, but since there is
>>>>>> a finite number of places we touch the timer Emily can just put
>>>>>> prints there. Also, I wonder if this temp fix helps her with the
>>>>>> issue or not.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>> occur?
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in reset
>>>>>>>> work and artificially rearming the timeout timer just before the
>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr)  in
>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>
>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>> already armed ob this scheduler or is there a timeout work in
>>>>>>>> execution right now (see documentation for work_busy) - obviously
>>>>>>>> this is not a full solution as this will not protect from races
>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>> drm_sched_fault -  so we probably need to account for this by
>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset really
>>>>>>>> mutually exclusive and not like now.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>> Hi Emily,
>>>>>>>>>
>>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>> completely meaningless here since we are called from different
>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>
>>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>>> time.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>> Hi Christian,
>>>>>>>>>>      I add the follow print in function drm_sched_cleanup_jobs.
>>>>>>>>>>  From the log it shows that only use cancel_delayed_work could
>>>>>>>>>> not avoid to free job when the sched is in reset. But don’t
>>>>>>>>>> know exactly where it is wrong about the driver. Do you have
>>>>>>>>>> any suggestion about this?
>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>          /*
>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>> running  OR thread
>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>> ring_mirror_list
>>>>>>>>>>           */
>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>                  return;
>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>> ring
>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>> pid:2253
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>> returned true it guarantees that timeout work is not running
>>>>>>>>>> but, it merely
>>>>>>>>>>> means there was a pending timeout work which was removed
>> from
>>>>>>>>>>> the workqueue before it's timer elapsed and so it didn't have
>>>>>>>>>>> a
>>>>>>>>>> chance to be
>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>> work. So there is a
>>>>>>>>>>> possibility where while timeout work started executing another
>>>>>>>>>> timeout work
>>>>>>>>>>> already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>>>> through
>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>>> while there is a
>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as timeout
>>>>>>>>>> work itself
>>>>>>>>>>> waits for schedule thread  to be parked again when calling
>>>>>>>>>> park_thread.
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>> ________________________________________
>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>> behalf of
>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>
>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>
>>>>>>>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>>>>>> running */
>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>> We never free jobs while the timeout worker is running to
>>>>>>>>>> prevent exactly
>>>>>>>>>>> that issue.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler free the
>>>>>>>>>> jobs while
>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>> drm_sched_stop.
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
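
As a rough picture of the two paths racing here (call chains reconstructed
from this discussion and the Oops quoted below, simplified):

/*
 * Timeout path (workqueue):               Free path (scheduler thread):
 *
 * drm_sched_job_timedout()                drm_sched_cleanup_jobs()
 *   -> amdgpu_job_timedout()                -> amdgpu_job_free_cb()
 *     -> amdgpu_device_gpu_recover()           -> drm_sched_job_cleanup()   (s_fence = NULL)
 *          dereferences                        -> kfree(job)
 *          job->base.s_fence->parent
 *          before reaching drm_sched_stop()
 */
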
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>
>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>>>>>>>
>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>>> in the first place.
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>>> jobs while
>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>>> if no
>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>> function
>>>>>>>>>>>>>>> amdgpu_job_free_cb  at the same time, because of the
>>>>>>>>>> hardware fence
>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case,
>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed in time.
>>>>>>>>>> Then this issue
>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>> *ERROR*
>>>>>>>>>>>>>>>> Process
>>>>>>>>>> information:
>>>>>>>>>>>>>>> process  pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>> information:
>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>> information: process pid 0 thread  pid 0,
>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>> dereference
>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>> [  449.801040]
>>>>>>>>>> Oops:
>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>>> G OE
>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX
>> +
>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>> [
>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>> 0000000000000000
>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>> RSI:
>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224]
>> RBP:
>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>> 0000000000000148
>>>>>>>>>> R12:
>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>> R14:
>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>>> ES: 0000
>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>> CR3:
>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747]
>> DR0:
>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140
>> [amdgpu]
>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some
>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time,
>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is already
>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> |  2
>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c     |
>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>> ++++++---
>>>>>>>>>>> --
>>>>>>>>>>>>>>>>>>>>>>>         2 files changed, 7 insertions(+), 6
>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>> ---
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> +++
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>              *
>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>> parent))
>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>> -                          if
>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>              * This iteration is thread safe as
>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>              * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>> -          struct dma_fence *fence =
>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>> +          struct dma_fence *fence =
>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>
>> _______________________________________________
>>>>>>>>>>>>>>>>>>>>> amd-gfx mailing list amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>>>>>>>> _______________________________________________
>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>> _______________________________________________
>>>>>>>> amd-gfx mailing list
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 14:07                                                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-18 14:07 UTC (permalink / raw)
  To: Deng, Emily, Koenig, Christian, amd-gfx

Thanks Emily.

Christian - ping for review.

Andrey

On 11/14/19 11:39 PM, Deng, Emily wrote:
> Hi Andrey,
>       Currently, I am busy with another issue; maybe I will try it next week.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Friday, November 15, 2019 6:14 AM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>
>> Attached.
>>
>> Emily - can you give it a try ?
>>
>> Andrey
>>
>> On 11/14/19 3:12 AM, Christian König wrote:
>>>> What about instead of peeking at the job to actually remove it from
>>>> ring_mirror_list right there,
>>> Also an interesting idea. We would need to protect the mirror list
>>> with a lock again, but that should be the lesser evil.
>>>
>>> Maybe prototype that and see if it works or not.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>
>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>> Another more fundamental question: Could we get rid of the timeout
>>>>> job at all?
>>>>
>>>> There is other stuff there besides picking the first unfinished job
>>>> which is common for all the drivers - such as freeing the guilty job
>>>> once it has signaled and rearming the timeout work timer.
>>>>
>>>>
>>>>> I mean we used to give this as parameter to the scheduler callback
>>>>> because we had the timeout worker in the job, but that is no longer
>>>>> the case.
>>>>>
>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>          job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>                                         struct drm_sched_job, node);
>>>>> Why don't we just remove that here and only get the first job after
>>>>> we have stopped the scheduler?
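
A hedged sketch of that alternative ordering - park the scheduler thread
first, only then look at the mirror list. Names are illustrative; in the real
code the parking and restart happen via drm_sched_stop()/drm_sched_start()
from the driver's recovery path, so this only illustrates the ordering idea:

static void sketch_timedout_stop_first(struct work_struct *work)
{
        struct drm_gpu_scheduler *sched =
                container_of(work, struct drm_gpu_scheduler, work_tdr.work);
        struct drm_sched_job *job;

        /* Park the scheduler thread first, so drm_sched_cleanup_jobs()
         * can no longer free jobs underneath us ... */
        kthread_park(sched->thread);

        /* ... and only then pick the first unfinished job. */
        job = list_first_entry_or_null(&sched->ring_mirror_list,
                                       struct drm_sched_job, node);
        if (job)
                sched->ops->timedout_job(job);

        kthread_unpark(sched->thread);
}
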
>>>>
>>>> Should be ok since we have the extra check for __kthread_should_park
>>>> in drm_sched_cleanup_jobs which will protect us in this case from a
>>>> wakeup of sched thread and execution of in drm_sched_cleanup_jobs
>>>> after we already parked it. The problem here is we need the
>>>> drm_sched_job to access the private data for each client driver (see
>>>> amdgpu_job_timedout for example). What about instead of peeking at
>>>> the job to actually remove it from ring_mirror_list right there, go
>>>> ahead with it through the reset routine; if it's signaled in the
>>>> meanwhile, that's great - release it, otherwise put it back into
>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>
>>>> Andrey
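
A sketch of the flow Andrey proposes here, assuming ring_mirror_list is again
protected by the scheduler's job_list_lock; the function name and exact
placement are illustrative, not the final patch:

static void sketch_timedout_remove_first(struct work_struct *work)
{
        struct drm_gpu_scheduler *sched =
                container_of(work, struct drm_gpu_scheduler, work_tdr.work);
        struct drm_sched_job *job;

        /* Actually take the job off the mirror list instead of only peeking
         * at it, so the free path can no longer release it while the reset
         * routine is still using it. */
        spin_lock(&sched->job_list_lock);
        job = list_first_entry_or_null(&sched->ring_mirror_list,
                                       struct drm_sched_job, node);
        if (job)
                list_del_init(&job->node);
        spin_unlock(&sched->job_list_lock);

        if (job)
                sched->ops->timedout_job(job);

        /* If the job signaled in the meanwhile it is released; otherwise
         * drm_sched_resubmit_jobs() puts it back onto ring_mirror_list. */
}
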
>>>>
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>> This is why I asked for a trace with timer enabled, but since there is
>>>>>> a finite number of places we touch the timer Emily can just put
>>>>>> prints there. Also, I wonder if this temp fix helps her with the
>>>>>> issue or not.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>> occur?
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in reset
>>>>>>>> work and artificially rearming the timeout timer just before the
>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr)  in
>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>
>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>>> already armed on this scheduler or is there a timeout work in
>>>>>>>> execution right now (see documentation for work_busy) - obviously
>>>>>>>> this is not a full solution as this will not protect from races
>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>> drm_sched_fault -  so we probably need to account for this by
>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset really
>>>>>>>> mutually exclusive and not like now.
>>>>>>>>
>>>>>>>> Andrey
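
The basic_fix check described above, sketched with work_busy(); this is an
interim guard only - per the work_busy() documentation the result is
advisory, so it narrows the window rather than closing it:

        /* In drm_sched_cleanup_jobs(): skip freeing jobs while a timeout
         * work is pending *or* currently executing on this scheduler. */
        if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
            work_busy(&sched->work_tdr.work))
                return;
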
>>>>>>>>
>>>>>>>>
>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>> Hi Emily,
>>>>>>>>>
>>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>> completely meaningless here since we are called from different
>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>
>>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>>> time.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
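
For example, prints along those lines would key on the scheduler instance via
sched->name instead of the worker's TID/PID (illustrative; s_job is the
drm_sched_job at hand in both callbacks):

        /* In the free path, e.g. amdgpu_job_free_cb(): */
        DRM_INFO("free_job: sched=%s s_job=%p\n", s_job->sched->name, s_job);

        /* In the timeout path, e.g. amdgpu_job_timedout(): */
        DRM_INFO("tdr: sched=%s s_job=%p\n", s_job->sched->name, s_job);
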
>>>>>>>>>
>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>> Hi Christian,
>>>>>>>>>>      I add the follow print in function drm_sched_cleanup_jobs.
>>>>>>>>>>  From the log it shows that only use cancel_delayed_work could
>>>>>>>>>> not avoid to free job when the sched is in reset. But don’t
>>>>>>>>>> know exactly where it is wrong about the driver. Do you have
>>>>>>>>>> any suggestion about this?
>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>          /*
>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>> running  OR thread
>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>> ring_mirror_list
>>>>>>>>>>           */
>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>                  return;
>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>> ring
>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>> pid:2253
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262,
>>>>>>>>>> pid:2262
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>> returned true it guarantees that timeout work is not running
>>>>>>>>>> but, it merely
>>>>>>>>>>> means there was a pending timeout work which was removed
>> from
>>>>>>>>>>> the workqueue before it's timer elapsed and so it didn't have
>>>>>>>>>>> a
>>>>>>>>>> chance to be
>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>> work. So there is a
>>>>>>>>>>> possibility where while timeout work started executing another
>>>>>>>>>> timeout work
>>>>>>>>>>> already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>>>> through
>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>>> while there is a
>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as timeout
>>>>>>>>>> work itself
>>>>>>>>>>> waits for schedule thread  to be parked again when calling
>>>>>>>>>> park_thread.
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>> ________________________________________
>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>> behalf of
>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>
>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>
>>>>>>>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>>>>>> running */
>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>> We never free jobs while the timeout worker is running to
>>>>>>>>>> prevent exactly
>>>>>>>>>>> that issue.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler free the
>>>>>>>>>> jobs while
>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>> drm_sched_stop.
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>> for tdr
>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>
>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in this case?
>>>>>>>>>>>>>
>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>>> in the first place.
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>>> jobs while
>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>>> if no
>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>> function
>>>>>>>>>>>>>>> amdgpu_job_free_cb  at the same time, because of the
>>>>>>>>>> hardware fence
>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case,
>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed in time.
>>>>>>>>>> Then this issue
>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>> [  449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>> *ERROR*
>>>>>>>>>>>>>>>> Process
>>>>>>>>>> information:
>>>>>>>>>>>>>>> process  pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>> [  449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>> information:
>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [  449.794222]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [  449.794255]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [  449.794257]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>> information: process pid 0 thread  pid 0,
>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>> dereference
>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>> [  449.801040]
>>>>>>>>>> Oops:
>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>> [  449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>>> G OE
>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>> [  449.802157] Hardware name: QEMU Standard PC (i440FX
>> +
>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>> [
>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>> [  449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>> c0 00         00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>> [  449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>> 0000000000000000
>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>> RSI:
>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [  449.807224]
>> RBP:
>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>> 0000000000000148
>>>>>>>>>> R12:
>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>> R14:
>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [  449.809004] FS:
>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>>> ES: 0000
>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>> CR3:
>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [  449.810747]
>> DR0:
>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>> [  449.812206] amdgpu_job_timedout+0x114/0x140
>> [amdgpu]
>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>> 449.813609]  ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [  449.814417]
>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>> 449.815004]  ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some
>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time,
>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is already
>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> |  2
>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c     |
>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>> ++++++---
>>>>>>>>>>> --
>>>>>>>>>>>>>>>>>>>>>>>         2 files changed, 7 insertions(+), 6
>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>> ---
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> +++
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>              *
>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>> parent))
>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>> -                          if
>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>              * This iteration is thread safe as
>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>> +                  if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>              * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>> -          struct dma_fence *fence =
>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>> +          struct dma_fence *fence =
>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>
>> _______________________________________________
>>>>>>>>>>>>>>>>>>>>> amd-gfx mailing list amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>>>>>>>> _______________________________________________
>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>> _______________________________________________
>>>>>>>> amd-gfx mailing list
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 16:16                                                                                                     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-18 16:16 UTC (permalink / raw)
  To: Andrey Grodzovsky, Deng, Emily, Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Andrey,

the only thing which doesn't look so good is the switch to
list_empty_careful in drm_sched_cleanup_jobs.

We either take the lock here or we don't, but please don't add that
extra check.

Christian.
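
The two options being contrasted, as a sketch (assuming ring_mirror_list is
guarded by the scheduler's job_list_lock; not the final code):

        bool empty;

        /* Either check under the lock ... */
        spin_lock(&sched->job_list_lock);
        empty = list_empty(&sched->ring_mirror_list);
        spin_unlock(&sched->job_list_lock);

        /* ... or don't take it at all; an unlocked list_empty_careful()
         * only narrows the race and is the extra check objected to above. */
        empty = list_empty_careful(&sched->ring_mirror_list);
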

Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
> Thanks Emily.
>
> Christian - ping for review.
>
> Andrey
>
> On 11/14/19 11:39 PM, Deng, Emily wrote:
>> Hi Andrey,
>>       Currently, I am busy with another issue; maybe I will try it next
>> week.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Sent: Friday, November 15, 2019 6:14 AM
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Attached.
>>>
>>> Emily - can you give it a try ?
>>>
>>> Andrey
>>>
>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>> What about instead of peeking at the job to actually remove it from
>>>>> ring_mirror_list right there,
>>>> Also an interesting idea. We would need to protect the mirror list
>>>> with a lock again, but that should be the lesser evil.
>>>>
>>>> Maybe prototype that and see if it works or not.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>
>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>> Another more fundamental question: Could we get rid of the timeout
>>>>>> job at all?
>>>>>
>>>>> There is other stuff there besides picking the first unfinished job
>>>>> which is common for all the drivers - such as freeing the guilty job
>>>>> once it has signaled and rearming the timeout work timer.
>>>>>
>>>>>
>>>>>> I mean we used to give this as parameter to the scheduler callback
>>>>>> because we had the timeout worker in the job, but that is no longer
>>>>>> the case.
>>>>>>
>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>          job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>                                         struct drm_sched_job, 
>>>>>>> node);
>>>>>> Why don't we just remove that here and only get the first job after
>>>>>> we have stopped the scheduler?
>>>>>
>>>>> Should be ok since we have the extra check for __kthread_should_park
>>>>> in drm_sched_cleanup_jobs which will protect us in this case from a
>>>>> wakeup of sched thread and execution of in drm_sched_cleanup_jobs
>>>>> after we already parked it. The problem here is we need the
>>>>> drm_sched_job to access the private data for each client driver (see
>>>>> amdgpu_job_timedout for example). What about instead of peeking at
>>>>> the job to actually remove it from ring_mirror_list right there, go
>>>>> ahead with it through the reset routine; if it's signaled in the
>>>>> meanwhile, that's great - release it, otherwise put it back into
>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>> This is why I asked for a trace with timer enabled, but since there is
>>>>>>> a finite number of places we touch the timer Emily can just put
>>>>>>> prints there. Also, I wonder if this temp fix helps her with the
>>>>>>> issue or not.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>>> occur?
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in reset
>>>>>>>>> work and artificially rearming the timeout timer just before the
>>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr)  in
>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>>
>>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>>> already armed on this scheduler or is there a timeout work in
>>>>>>>>> execution right now (see documentation for work_busy) - obviously
>>>>>>>>> this is not a full solution as this will not protect from races
>>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>>> drm_sched_fault -  so we probably need to account for this by
>>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset really
>>>>>>>>> mutually exclusive and not like now.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>> Hi Emily,
>>>>>>>>>>
>>>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>>> completely meaningless here since we are called from different
>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>
>>>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>>>> time.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>      I add the follow print in function drm_sched_cleanup_jobs.
>>>>>>>>>>>  From the log it shows that only use cancel_delayed_work could
>>>>>>>>>>> not avoid to free job when the sched is in reset. But don’t
>>>>>>>>>>> know exactly where it is wrong about the driver. Do you have
>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>          /*
>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>> running  OR thread
>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>           */
>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>                  return;
>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>> Best wishes
>>>>>>>>>>> Emily Deng
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>> ring
>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>> pid:2253
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>>> returned true it guarantees that timeout work is not running
>>>>>>>>>>> but, it merely
>>>>>>>>>>>> means there was a pending timeout work which was removed
>>> from
>>>>>>>>>>>> the workqueue before it's timer elapsed and so it didn't have
>>>>>>>>>>>> a
>>>>>>>>>>> chance to be
>>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>>> work. So there is a
>>>>>>>>>>>> possibility where while timeout work started executing another
>>>>>>>>>>> timeout work
>>>>>>>>>>>> already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>>>>> through
>>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>>>> while there is a
>>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as timeout
>>>>>>>>>>> work itself
>>>>>>>>>>>> waits for schedule thread  to be parked again when calling
>>>>>>>>>>> park_thread.
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>> behalf of
>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>
>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>
>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>>>>>>> running */
>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>> We never free jobs while the timeout working is running to
>>>>>>>>>>> prevent exactly
>>>>>>>>>>>> that issue.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler free the
>>>>>>>>>>> jobs while
>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>> drm_sched_stop.
>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in this 
>>>>>>>>>>>>>> case?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>>>> jobs while
>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>>>> if no
>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>>> function
>>>>>>>>>>>>>>>> amdgpu_job_free_cb at the same time, because of the
>>>>>>>>>>> hardware fence
>>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case,
>>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed in 
>>>>>>>>>>>>>>>> time.
>>>>>>>>>>> Then this issue
>>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>> *ERROR*
>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>> information:
>>>>>>>>>>>>>>>> process  pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>> [  449.801040]
>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>> +
>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>> [
>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>> 0000000000000000
>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>> RSI:
>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>> RBP:
>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>> 0000000000000148
>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>> R14:
>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 449.809004] FS:
>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>>>> ES: 0000
>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>> CR3:
>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>> DR0:
>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>> [amdgpu]
>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 449.814417]
>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some
>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time,
>>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is already
>>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> |  2
>>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c |
>>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>>> ++++++---
>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 7 insertions(+), 6
>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>> ---
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> +++
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>>> parent))
>>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>>> - if
>>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>> * This iteration is thread safe as
>>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>> + if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>> * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>>> - struct dma_fence *fence =
>>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>> + struct dma_fence *fence =
>>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 16:16                                                                                                     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-18 16:16 UTC (permalink / raw)
  To: Andrey Grodzovsky, Deng, Emily, Koenig, Christian, amd-gfx

Hi Andrey,

the only thing which doesn't look so good is the switch to
list_empty_careful in drm_sched_cleanup_jobs.

We either take the lock here or we don't, but please don't add that
extra checking in between.
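
For illustration, a minimal sketch of the two alternatives (not the
actual patch; field names follow struct drm_gpu_scheduler):

/* (a) take job_list_lock for the emptiness check as well */
static bool mirror_list_empty_locked(struct drm_gpu_scheduler *sched)
{
	unsigned long flags;
	bool empty;

	spin_lock_irqsave(&sched->job_list_lock, flags);
	empty = list_empty(&sched->ring_mirror_list);
	spin_unlock_irqrestore(&sched->job_list_lock, flags);

	return empty;
}

/* (b) lockless check, valid only while list_del_init() is the sole
 * concurrent modification of the list */
static bool mirror_list_empty_careful(struct drm_gpu_scheduler *sched)
{
	return list_empty_careful(&sched->ring_mirror_list);
}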

Christian.

Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
> Thanks Emily.
>
> Christan - ping for review.
>
> Andrey
>
> On 11/14/19 11:39 PM, Deng, Emily wrote:
>> Hi Andrey,
>>       Currently, I am busying with another issue, maybe will try next 
>> week.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Sent: Friday, November 15, 2019 6:14 AM
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>
>>> Attached.
>>>
>>> Emily - can you give it a try ?
>>>
>>> Andrey
>>>
>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>> What about instead of peeking at the job to actually remove it from
>>>>> ring_mirror_list right there,
>>>> Also an interesting idea. We would need to protect the mirror list
>>>> with a lock again, but that should be the lesser evil.
>>>>
>>>> Maybe prototype that and see if it works or not.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>
>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>> Another more fundamental question: Could we get rid of the timeout
>>>>>> job at all?
>>>>>
>>>>> There are other stuff there besides picking the first unfinished job
>>>>> which is common for all the drivers - such as freeing guilty signaled
>>>>> job and rearming the timeout work timer.
>>>>>
>>>>>
>>>>>> I mean we used to give this as parameter to the scheduler callback
>>>>>> because we had the timeout worker in the job, but that is no longer
>>>>>> the case.
>>>>>>
>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>          job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>                                         struct drm_sched_job, 
>>>>>>> node);
>>>>>> Why don't we just remove that here and only get the first job after
>>>>>> we have stopped the scheduler?
>>>>>
>>>>> Should be ok since we have the extra check for __kthread_should_park
>>>>> in drm_sched_cleanup_jobs which will protect us in this case from a
>>>>> wakeup of sched thread and execution of in drm_sched_cleanup_jobs
>>>>> after we already parked it. The problem here is we need the
>>>>> drm_sched_job to access the private data for each client driver (see
>>>>> amdgpu_job_timedout for example). What about instead of peeking at
>>>>> the job to actually remove it from ring_mirror_list right there, go
>>>>> ahead with it through the reset routine, if it's signaled in the
>>>>> meanwhile that great - release it, otherwise put it back into
>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>> This why I asked for a trace with timer enabled, but since there is
>>>>>>> a finite number of places we touch the timer Emily can just put
>>>>>>> prints there. Also, I wonder if this temp fix helps her with the
>>>>>>> issue or not.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>>> occur?
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in reset
>>>>>>>>> work and artificially rearming the timeout timer just before the
>>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr)  in
>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>>
>>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>>> already armed ob this scheduler or is there a timeout work in
>>>>>>>>> execution right now (see documentation for work_busy) - obviously
>>>>>>>>> this is not a full solution as this will not protect from races
>>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>>> drm_sched_fault -  so we probably need to account for this by
>>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset really
>>>>>>>>> mutually exclusive and not like now.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>> Hi Emily,
>>>>>>>>>>
>>>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>>> completely meaningless here since we are called from different
>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>
>>>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>>>> time.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>      I add the follow print in function drm_sched_cleanup_jobs.
>>>>>>>>>>>  From the log it shows that only use cancel_delayed_work could
>>>>>>>>>>> not avoid to free job when the sched is in reset. But don’t
>>>>>>>>>>> know exactly where it is wrong about the driver. Do you have
>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>          /*
>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>> running  OR thread
>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>           */
>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>                  return;
>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>> Best wishes
>>>>>>>>>>> Emily Deng
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>> ring
>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>> pid:2253
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, tid:2262,
>>>>>>>>>>> pid:2262
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>>> returned true it guarantees that timeout work is not running
>>>>>>>>>>> but, it merely
>>>>>>>>>>>> means there was a pending timeout work which was removed
>>> from
>>>>>>>>>>>> the workqueue before it's timer elapsed and so it didn't have
>>>>>>>>>>>> a
>>>>>>>>>>> chance to be
>>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>>> work. So there is a
>>>>>>>>>>>> possibility where while timeout work started executing another
>>>>>>>>>>> timeout work
>>>>>>>>>>>> already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>>>>> through
>>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>>>> while there is a
>>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as timeout
>>>>>>>>>>> work itself
>>>>>>>>>>>> waits for schedule thread  to be parked again when calling
>>>>>>>>>>> park_thread.
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>> behalf of
>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>
>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>
>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>>>>>>> running */
>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>> We never free jobs while the timeout working is running to
>>>>>>>>>>> prevent exactly
>>>>>>>>>>>> that issue.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout call
>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler free the
>>>>>>>>>>> jobs while
>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>> drm_sched_stop.
>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>> for tdr
>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in this 
>>>>>>>>>>>>>> case?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>>>> jobs while
>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>>>> if no
>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>>> function
>>>>>>>>>>>>>>>> amdgpu_job_free_cb at the same time, because of the
>>>>>>>>>>> hardware fence
>>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case,
>>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed in 
>>>>>>>>>>>>>>>> time.
>>>>>>>>>>> Then this issue
>>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>> *ERROR*
>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>> information:
>>>>>>>>>>>>>>>> process  pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>> [  449.801040]
>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>> +
>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>> [
>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>> 0000000000000000
>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>> RSI:
>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>> RBP:
>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>> 0000000000000148
>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>> R14:
>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 449.809004] FS:
>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>>>> ES: 0000
>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>> CR3:
>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>> DR0:
>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>> [amdgpu]
>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 449.814417]
>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some
>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time,
>>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is already
>>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> |  2
>>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c |
>>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>>> ++++++---
>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 7 insertions(+), 6
>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>> ---
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> +++
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>>> parent))
>>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>>> - if
>>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>> * This iteration is thread safe as
>>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>> + if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>> * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>>> - struct dma_fence *fence =
>>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>> + struct dma_fence *fence =
>>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 16:23                                                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-18 16:23 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Can you explain why? As I see it, list_empty_careful is specifically
designed for the case where the only other concurrent operation in
progress is list_del_init
(https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html),
which is exactly what happens in this patch; no other list-altering
operation can take place concurrently, so it looks safe to use to me.
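
A minimal sketch of the pattern under discussion (illustrative only; the helper name and the signaled check are my own, while ring_mirror_list and job_list_lock follow the scheduler code of that time):

	static struct drm_sched_job *
	sched_peek_finished_job(struct drm_gpu_scheduler *sched)
	{
		struct drm_sched_job *job = NULL;

		/*
		 * Lock-free peek: list_empty_careful() is safe here because the
		 * only concurrent modification is a list_del_init() done after
		 * this thread has been parked.
		 */
		if (list_empty_careful(&sched->ring_mirror_list))
			return NULL;

		spin_lock(&sched->job_list_lock);
		job = list_first_entry_or_null(&sched->ring_mirror_list,
					       struct drm_sched_job, node);
		if (job && dma_fence_is_signaled(&job->s_fence->finished))
			list_del_init(&job->node);	/* hand it to the caller */
		else
			job = NULL;
		spin_unlock(&sched->job_list_lock);

		return job;
	}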

Andrey

On 11/18/19 11:16 AM, Christian König wrote:
> Hi Andrey,
>
> the only thing which doesn't look so good is the switch to 
> list_empty_careful in drm_sched_cleanup_jobs.
>
> We either take the lock here or we don't, but please don't add that extra
> checking.
>
> Christian.
>
> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>> Thanks Emily.
>>
>> Christan - ping for review.
>>
>> Andrey
>>
>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>> Hi Andrey,
>>>       Currently, I am busy with another issue; maybe I will try it
>>> next week.
>>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Attached.
>>>>
>>>> Emily - can you give it a try ?
>>>>
>>>> Andrey
>>>>
>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>> What about instead of peeking at the job to actually remove it from
>>>>>> ring_mirror_list right there,
>>>>> Also an interesting idea. We would need to protect the mirror list
>>>>> with a lock again, but that should be the lesser evil.
>>>>>
>>>>> Maybe prototype that and see if it works or not.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>
>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>> Another more fundamental question: Could we get rid of the timeout
>>>>>>> job at all?
>>>>>>
>>>>>> There is other stuff there besides picking the first unfinished job
>>>>>> which is common to all the drivers, such as freeing the guilty job
>>>>>> once it is signaled and rearming the timeout work timer.
>>>>>>
>>>>>>
>>>>>>> I mean we used to give this as parameter to the scheduler callback
>>>>>>> because we had the timeout worker in the job, but that is no longer
>>>>>>> the case.
>>>>>>>
>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>          job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>                                         struct drm_sched_job, 
>>>>>>>> node);
>>>>>>> Why don't we just remove that here and only get the first job after
>>>>>>> we have stopped the scheduler?
>>>>>>
>>>>>> Should be OK, since we have the extra check for __kthread_should_park
>>>>>> in drm_sched_cleanup_jobs, which will protect us in this case from a
>>>>>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>>>>>> after we have already parked it. The problem here is that we need the
>>>>>> drm_sched_job to access the private data for each client driver (see
>>>>>> amdgpu_job_timedout for example). What about, instead of just peeking at
>>>>>> the job, actually removing it from ring_mirror_list right there and
>>>>>> carrying it through the reset routine: if it's signaled in the
>>>>>> meanwhile, great, release it; otherwise put it back into
>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
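
A rough sketch of that idea (illustrative only, not a tested patch; the list and lock names follow the scheduler of that time):

	static void drm_sched_job_timedout(struct work_struct *work)
	{
		struct drm_gpu_scheduler *sched =
			container_of(work, struct drm_gpu_scheduler, work_tdr.work);
		struct drm_sched_job *job;
		unsigned long flags;

		/* Detach the job first so drm_sched_cleanup_jobs() can no longer
		 * free it behind our back. */
		spin_lock_irqsave(&sched->job_list_lock, flags);
		job = list_first_entry_or_null(&sched->ring_mirror_list,
					       struct drm_sched_job, node);
		if (job)
			list_del_init(&job->node);
		spin_unlock_irqrestore(&sched->job_list_lock, flags);

		if (job)
			sched->ops->timedout_job(job);

		/* If the job signaled in the meanwhile, the reset path releases it;
		 * otherwise drm_sched_resubmit_jobs() puts it back onto
		 * ring_mirror_list before resubmission. */
	}
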
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>> This is why I asked for a trace with the timer enabled, but since
>>>>>>>> there is
>>>>>>>> a finite number of places where we touch the timer, Emily can just put
>>>>>>>> prints there. Also, I wonder whether this temp fix helps her with the
>>>>>>>> issue or not.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>>>> occur?
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>> simulate_crash.patch: waiting on the guilty job to signal in the reset
>>>>>>>>>> work and artificially rearming the timeout timer just before the
>>>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>> drm_sched_cleanup_jobs; the crash log is attached in crash.log. I
>>>>>>>>>> think this confirms the theory I described earlier in this thread.
>>>>>>>>>>
>>>>>>>>>> basic_fix.patch handles this by testing whether another timer is
>>>>>>>>>> already armed on this scheduler or whether a timeout work is
>>>>>>>>>> executing right now (see the documentation for work_busy). Obviously
>>>>>>>>>> this is not a full solution, as it will not protect from races if,
>>>>>>>>>> for example, there is immediate work scheduling such as in
>>>>>>>>>> drm_sched_fault, so we probably need to account for this by making
>>>>>>>>>> drm_sched_cleanup_jobs (at least the part where it iterates the ring
>>>>>>>>>> mirror list and frees jobs) and GPU reset really mutually exclusive,
>>>>>>>>>> not like now.
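
Presumably that check looks something like the following (a guess at the shape only, since the attachment is not inlined here):

	/* Bail out of job cleanup if a TDR is pending *or* already executing.
	 * work_busy() reports WORK_BUSY_PENDING / WORK_BUSY_RUNNING, so unlike
	 * cancel_delayed_work() alone it also notices a handler that has
	 * started running; it still cannot close every race window. */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    (delayed_work_pending(&sched->work_tdr) ||
	     work_busy(&sched->work_tdr.work)))
		return;
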
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>
>>>>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>>>>> and which one is triggering the reset. The TID and PID are
>>>>>>>>>>> completely meaningless here since we are called from different
>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>
>>>>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>>>>> time.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>      I added the following prints in function drm_sched_cleanup_jobs.
>>>>>>>>>>>> The log shows that using cancel_delayed_work alone cannot avoid
>>>>>>>>>>>> freeing a job while the sched is in reset, but I don't know
>>>>>>>>>>>> exactly where the driver goes wrong. Do you have
>>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>>> +	printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>> 	/*
>>>>>>>>>>>> 	 * Don't destroy jobs while the timeout worker is running OR thread
>>>>>>>>>>>> 	 * is being parked and hence assumed to not touch ring_mirror_list
>>>>>>>>>>>> 	 */
>>>>>>>>>>>> 	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>> 	    !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>> 		return;
>>>>>>>>>>>> +	printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>> ring
>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>> pid:2253
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>> Thinking more about this claim: we assume here that if
>>>>>>>>>>>>> cancel_delayed_work returned true it guarantees that the timeout
>>>>>>>>>>>>> work is not running. But it merely means there was a pending
>>>>>>>>>>>>> timeout work which was removed from the workqueue before its
>>>>>>>>>>>>> timer elapsed, so it didn't have a chance to be dequeued and
>>>>>>>>>>>>> executed; it doesn't cover already-executing work. So there is a
>>>>>>>>>>>>> possibility that while a timeout work started executing, another
>>>>>>>>>>>>> timeout work already got enqueued (maybe through earlier cleanup
>>>>>>>>>>>>> jobs or through drm_sched_fault), and if at this point another
>>>>>>>>>>>>> drm_sched_cleanup_jobs runs, cancel_delayed_work(&sched->work_tdr)
>>>>>>>>>>>>> will return true even while there is a timeout job in progress.
>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work, as the timeout
>>>>>>>>>>>>> work itself waits for the scheduler thread to be parked again when
>>>>>>>>>>>>> calling park_thread.
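
For reference, the check in question is essentially this (simplified sketch, body elided), with the limitation spelled out:

	static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
	{
		/* Catches a *pending* TDR: the delayed work is dequeued before
		 * its timer fires, so we know it has not started yet. */
		if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
		    !cancel_delayed_work(&sched->work_tdr))
			return;

		/* A TDR handler that is *already running* is not detected above,
		 * so freeing jobs below can still race with the GPU reset work. */

		/* ... iterate ring_mirror_list and free finished jobs ... */
	}
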
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>> behalf of
>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>
>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>
>>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>>>>>>>> running */
>>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>> We never free jobs while the timeout worker is running, to
>>>>>>>>>>>>> prevent exactly that issue.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>         The drm_sched_job_timedout -> amdgpu_job_timedout path
>>>>>>>>>>>>>> calls amdgpu_device_gpu_recover. I mean the main scheduler frees
>>>>>>>>>>>>>> the jobs while we are in amdgpu_device_gpu_recover, before
>>>>>>>>>>>>>> drm_sched_stop is called.
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in this 
>>>>>>>>>>>>>>> case?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>          No, I am on the new branch and it also has the
>>>>>>>>>>>>>>>> patch. Even if the jobs are freed by the main scheduler, how
>>>>>>>>>>>>>>>> could we avoid the main scheduler freeing jobs while we are
>>>>>>>>>>>>>>>> entering amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>>>>> if no
>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>           Please refer to the following log: when it enters the
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover function, the bad job 000000005086879e
>>>>>>>>>>>>>>>>>> is being freed in amdgpu_job_free_cb at the same time, because
>>>>>>>>>>>>>>>>>> the hardware fence signaled. But amdgpu_device_gpu_recover goes
>>>>>>>>>>>>>>>>>> faster; in this case the s_fence is already freed while the job
>>>>>>>>>>>>>>>>>> is not freed in time. Then this issue occurs.
>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>>> *ERROR*
>>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>> process  pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>>> [  449.801040]
>>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>>> +
>>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>>> [
>>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>>> 0000000000000000
>>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>>> RSI:
>>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>>> RBP:
>>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>>> 0000000000000148
>>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>>> R14:
>>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 449.809004] FS:
>>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>>>>> ES: 0000
>>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>>> CR3:
>>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>>> DR0:
>>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>>> [amdgpu]
>>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 449.814417]
>>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some
>>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
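
For reference, the relevant part of drm_sched_job_cleanup() is roughly the following; this is paraphrased from memory of the scheduler at that time, so check sched_main.c rather than relying on it:

	void drm_sched_job_cleanup(struct drm_sched_job *job)
	{
		/* Drop the scheduler-fence reference and clear the pointer;
		 * this only happens when the job itself is being torn down. */
		dma_fence_put(&job->s_fence->finished);
		job->s_fence = NULL;
	}
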
>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>>>>>>>>>>>>>> case, when it enters amdgpu_device_gpu_recover, it is already in
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup, and at this time it will go on to free the
>>>>>>>>>>>>>>>>>>>>>>> job. But amdgpu_device_gpu_recover is sometimes faster; at that
>>>>>>>>>>>>>>>>>>>>>>> time the job is not yet freed, but s_fence is already NULL.
>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>>>>>>>>>>>>  drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>>>>>>>>>>>>>>>  2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>  	 *
>>>>>>>>>>>>>>>>>>>>>>>>>  	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>  	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>>>>>>>>>>>>  		job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>  			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>>  			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>>>>>>>>>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>>  					if (atomic_read(&bad->karma) >
>>>>>>>>>>>>>>>>>>>>>>>>>  					    bad->sched->hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>>  						if (entity->guilty)
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>  	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>>>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>>>>>>>>>>>  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>  		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>>>>>>>>>>>>>>>>>>>>  					      &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>>  			atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>  			 *
>>>>>>>>>>>>>>>>>>>>>>>>>  			 * Job is still alive so fence refcount at least 1
>>>>>>>>>>>>>>>>>>>>>>>>>  			 */
>>>>>>>>>>>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>  			/*
>>>>>>>>>>>>>>>>>>>>>>>>>  			 * We must keep bad job alive for later use during
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>>  	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>>>>>>>>>>>  	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>  		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 16:23                                                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-18 16:23 UTC (permalink / raw)
  To: christian.koenig, Deng, Emily, amd-gfx

Can you explain why? As I see it, list_empty_careful is specifically
designed for the case where the only other concurrent operation in
progress is list_del_init
(https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html),
which is exactly what happens in this patch; no other list-altering
operation can take place concurrently, so it looks safe to use to me.

Andrey

On 11/18/19 11:16 AM, Christian König wrote:
> Hi Andrey,
>
> the only thing which doesn't look so good is the switch to 
> list_empty_careful in drm_sched_cleanup_jobs.
>
> We either take the lock here or we don't, but please don't add that extra
> checking.
>
> Christian.
>
> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>> Thanks Emily.
>>
>> Christan - ping for review.
>>
>> Andrey
>>
>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>> Hi Andrey,
>>>       Currently, I am busy with another issue; maybe I will try it
>>> next week.
>>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>
>>>> Attached.
>>>>
>>>> Emily - can you give it a try ?
>>>>
>>>> Andrey
>>>>
>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>> What about instead of peeking at the job to actually remove it from
>>>>>> ring_mirror_list right there,
>>>>> Also an interesting idea. We would need to protect the mirror list
>>>>> with a lock again, but that should be the lesser evil.
>>>>>
>>>>> Maybe prototype that and see if it works or not.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>
>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>> Another more fundamental question: Could we get rid of the timeout
>>>>>>> job at all?
>>>>>>
>>>>>> There is other stuff there besides picking the first unfinished job
>>>>>> which is common to all the drivers, such as freeing the guilty job
>>>>>> once it is signaled and rearming the timeout work timer.
>>>>>>
>>>>>>
>>>>>>> I mean we used to give this as parameter to the scheduler callback
>>>>>>> because we had the timeout worker in the job, but that is no longer
>>>>>>> the case.
>>>>>>>
>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>          job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>                                         struct drm_sched_job, 
>>>>>>>> node);
>>>>>>> Why don't we just remove that here and only get the first job after
>>>>>>> we have stopped the scheduler?
>>>>>>
>>>>>> Should be OK, since we have the extra check for __kthread_should_park
>>>>>> in drm_sched_cleanup_jobs, which will protect us in this case from a
>>>>>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>>>>>> after we have already parked it. The problem here is that we need the
>>>>>> drm_sched_job to access the private data for each client driver (see
>>>>>> amdgpu_job_timedout for example). What about, instead of just peeking at
>>>>>> the job, actually removing it from ring_mirror_list right there and
>>>>>> carrying it through the reset routine: if it's signaled in the
>>>>>> meanwhile, great, release it; otherwise put it back into
>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>> This why I asked for a trace with timer enabled, but since 
>>>>>>>> there is
>>>>>>>> a finite number of places we touch the timer Emily can just put
>>>>>>>> prints there. Also, I wonder if this temp fix helps her with the
>>>>>>>> issue or not.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>>>> occur?
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in reset
>>>>>>>>>> work and artificially rearming the timeout timer just before the
>>>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This I
>>>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>>>
>>>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>>>> already armed ob this scheduler or is there a timeout work in
>>>>>>>>>> execution right now (see documentation for work_busy) - 
>>>>>>>>>> obviously
>>>>>>>>>> this is not a full solution as this will not protect from races
>>>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>>>> drm_sched_fault -  so we probably need to account for this by
>>>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset really
>>>>>>>>>> mutually exclusive and not like now.
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>
>>>>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>>>> completely meaningless here since we are called from different
>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>
>>>>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>>>>> time.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>      I add the follow print in function 
>>>>>>>>>>>> drm_sched_cleanup_jobs.
>>>>>>>>>>>>  From the log it shows that only use cancel_delayed_work could
>>>>>>>>>>>> not avoid to free job when the sched is in reset. But don’t
>>>>>>>>>>>> know exactly where it is wrong about the driver. Do you have
>>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>          /*
>>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>>> running  OR thread
>>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>>           */
>>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>                  return;
>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>> ring
>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>> pid:2253
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, pid:2262
>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>>>> returned true it guarantees that timeout work is not running
>>>>>>>>>>>> but, it merely
>>>>>>>>>>>>> means there was a pending timeout work which was removed
>>>> from
>>>>>>>>>>>>> the workqueue before it's timer elapsed and so it didn't have
>>>>>>>>>>>>> a
>>>>>>>>>>>> chance to be
>>>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>>>> work. So there is a
>>>>>>>>>>>>> possibility where while timeout work started executing 
>>>>>>>>>>>>> another
>>>>>>>>>>>> timeout work
>>>>>>>>>>>>> already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>>>>>> through
>>>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>>>>> while there is a
>>>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as timeout
>>>>>>>>>>>> work itself
>>>>>>>>>>>>> waits for schedule thread  to be parked again when calling
>>>>>>>>>>>> park_thread.
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>> behalf of
>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>
>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>
>>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>>>>>>>> running */
>>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>> We never free jobs while the timeout working is running to
>>>>>>>>>>>> prevent exactly
>>>>>>>>>>>>> that issue.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout 
>>>>>>>>>>>>>> call
>>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler free the
>>>>>>>>>>>> jobs while
>>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>>> drm_sched_stop.
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in this 
>>>>>>>>>>>>>>> case?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>>>>> if no
>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>>>> function
>>>>>>>>>>>>>>>>> amdgpu_job_free_cb at the same time, because of the
>>>>>>>>>>>> hardware fence
>>>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case,
>>>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed in 
>>>>>>>>>>>>>>>>> time.
>>>>>>>>>>>> Then this issue
>>>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>>> *ERROR*
>>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>> process  pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>>> [  449.801040]
>>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>>> +
>>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>>> [
>>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>>> 0000000000000000
>>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>>> RSI:
>>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>>> RBP:
>>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>>> 0000000000000148
>>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>>> R14:
>>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 449.809004] FS:
>>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>>>>> ES: 0000
>>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>>> CR3:
>>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>>> DR0:
>>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>>> [amdgpu]
>>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 449.814417]
>>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some
>>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this time,
>>>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is already
>>>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing the
>>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> |  2
>>>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c |
>>>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>>>> ++++++---
>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 7 insertions(+), 6
>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>>>> parent))
>>>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>>>> - if
>>>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>> * This iteration is thread safe as
>>>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>> + if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>> * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>>>> - struct dma_fence *fence =
>>>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>> + struct dma_fence *fence =
>>>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 16:44                                                                                                             ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-18 16:44 UTC (permalink / raw)
  To: Andrey Grodzovsky, Deng, Emily, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

list_empty_careful() should only be used as an optimization, but never
if you need to rely on the result.

The problem is that the function doesn't have any memory barriers
whatsoever; it just checks whether both the next and the prev pointer
indicate an empty list instead of just the next pointer.

Christian.
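
For reference, the two helpers in question looked roughly like this around
that kernel version (a sketch from memory, not a verbatim copy of
include/linux/list.h):

static inline int list_empty(const struct list_head *head)
{
        return READ_ONCE(head->next) == head;
}

/*
 * The "careful" variant also tests prev so it doesn't report an empty
 * list while a concurrent list_del_init() is only half done.  Note that
 * there is no memory barrier here, which is the point above: the result
 * is not ordered against anything else and must not be relied upon.
 */
static inline int list_empty_careful(const struct list_head *head)
{
        struct list_head *next = head->next;

        return (next == head) && (next == head->prev);
}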

Am 18.11.19 um 17:23 schrieb Andrey Grodzovsky:
> Can you explain why? As I see it, list_empty_careful is specifically 
> designed for the case where the only other concurrent operation in 
> progress is list_del_init 
> (https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html), 
> which is exactly what happens in this patch; no other list-altering 
> operation can take place concurrently, so it looks safe to use to me.
>
> Andrey
>
> On 11/18/19 11:16 AM, Christian König wrote:
>> Hi Andrey,
>>
>> the only thing which doesn't look so good is the switch to 
>> list_empty_careful in drm_sched_cleanup_jobs.
>>
>> We either take the lock here or we don't, but please don't add that 
>> extra checking.
>>
>> Christian.
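
To illustrate the point: the cleanup path can either take the job list
lock around the check, or not check at all; a lockless
list_empty_careful() peek in between is what is being objected to.  A
rough sketch of the two options (illustrative, not the actual
drm_sched_cleanup_jobs code):

        /* Option 1: take the lock, then the plain check is reliable. */
        spin_lock(&sched->job_list_lock);
        job = list_first_entry_or_null(&sched->ring_mirror_list,
                                       struct drm_sched_job, node);
        spin_unlock(&sched->job_list_lock);

        /*
         * Option 2 (objected to): peek with list_empty_careful() before
         * taking the lock.  Without barriers the result is only a hint
         * and cannot be used to decide whether freeing jobs is safe.
         */
        if (list_empty_careful(&sched->ring_mirror_list))
                return;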
>>
>> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>>> Thanks Emily.
>>>
>>> Christian - ping for review.
>>>
>>> Andrey
>>>
>>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>>> Hi Andrey,
>>>>       Currently, I am busy with another issue; maybe I will try it 
>>>> next week.
>>>>
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Attached.
>>>>>
>>>>> Emily - can you give it a try ?
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>>> What about instead of peeking at the job to actually remove it from
>>>>>>> ring_mirror_list right there,
>>>>>> Also an interesting idea. We would need to protect the mirror list
>>>>>> with a lock again, but that should be the lesser evil.
>>>>>>
>>>>>> Maybe prototype that and see if it works or not.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>>
>>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>>> Another more fundamental question: Could we get rid of the timeout
>>>>>>>> job at all?
>>>>>>>
>>>>>>> There is other stuff there besides picking the first unfinished 
>>>>>>> job which is common to all the drivers - such as freeing the 
>>>>>>> guilty job once it has signaled and rearming the timeout work 
>>>>>>> timer.
>>>>>>>
>>>>>>>
>>>>>>>> I mean we used to give this as a parameter to the scheduler callback
>>>>>>>> because we had the timeout worker in the job, but that is no longer
>>>>>>>> the case.
>>>>>>>>
>>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>>          job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>>                                         struct drm_sched_job, 
>>>>>>>>> node);
>>>>>>>> Why don't we just remove that here and only get the first job 
>>>>>>>> after
>>>>>>>> we have stopped the scheduler?
>>>>>>>
>>>>>>> Should be ok since we have the extra check for __kthread_should_park
>>>>>>> in drm_sched_cleanup_jobs which will protect us in this case from a
>>>>>>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>>>>>>> after we already parked it. The problem here is we need the
>>>>>>> drm_sched_job to access the private data for each client driver (see
>>>>>>> amdgpu_job_timedout for example). What about, instead of peeking at
>>>>>>> the job, actually removing it from ring_mirror_list right there and
>>>>>>> going ahead with it through the reset routine; if it has signaled in
>>>>>>> the meanwhile, great - release it, otherwise put it back into
>>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>>>
>>>>>>> Andrey
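
A rough pseudocode sketch of the idea proposed above (illustrative only,
not a patch; the names follow the scheduler code of that time):

        /* In the timeout handler: take the job off the list instead of peeking. */
        spin_lock(&sched->job_list_lock);
        job = list_first_entry_or_null(&sched->ring_mirror_list,
                                       struct drm_sched_job, node);
        if (job)
                list_del_init(&job->node);
        spin_unlock(&sched->job_list_lock);

        if (job)
                sched->ops->timedout_job(job);  /* run the full reset routine */

        /* Later, e.g. around drm_sched_resubmit_jobs(): */
        if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
                sched->ops->free_job(job);      /* it completed in the meanwhile */
        } else if (job) {
                spin_lock(&sched->job_list_lock);
                list_add(&job->node, &sched->ring_mirror_list);  /* put it back */
                spin_unlock(&sched->job_list_lock);
        }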
>>>>>>>
>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>>> This is why I asked for a trace with the timer enabled, but since 
>>>>>>>>> there is a finite number of places where we touch the timer, Emily 
>>>>>>>>> can just put prints there. Also, I wonder whether this temp fix 
>>>>>>>>> helps her with the issue or not.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>>>>> occur?
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>>> simulate_crash.patch - waiting on the guilty job to signal in the
>>>>>>>>>>> reset work and artificially rearming the timeout timer just before
>>>>>>>>>>> the check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This, I
>>>>>>>>>>> think, confirms the theory I described earlier in this thread.
>>>>>>>>>>>
>>>>>>>>>>> basic_fix.patch handles this by testing whether another timer is
>>>>>>>>>>> already armed on this scheduler or a timeout work is in execution
>>>>>>>>>>> right now (see the documentation for work_busy) - obviously this
>>>>>>>>>>> is not a full solution, as it will not protect from races if, for
>>>>>>>>>>> example, there is immediate work scheduling such as in
>>>>>>>>>>> drm_sched_fault - so we probably need to account for this by
>>>>>>>>>>> making drm_sched_cleanup_jobs (at least the part where it iterates
>>>>>>>>>>> the ring mirror list and frees jobs) and GPU reset really mutually
>>>>>>>>>>> exclusive, not like now.
>>>>>>>>>>>
>>>>>>>>>>> Andrey
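
The guard described for basic_fix.patch presumably looks something like
the sketch below (an assumption about that patch, not a quote from it;
work_busy() and the WORK_BUSY_* flags are the regular workqueue API):

        /* Skip freeing jobs if a timeout handler is pending or currently running. */
        if (work_busy(&sched->work_tdr.work) &
            (WORK_BUSY_PENDING | WORK_BUSY_RUNNING))
                return;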
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>
>>>>>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>>>>>> and which one is triggering the reset. The TID and PID are
>>>>>>>>>>>> completely meaningless here since we are called from different
>>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>>
>>>>>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>>>>>> time.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>      I added the following print in function 
>>>>>>>>>>>>> drm_sched_cleanup_jobs.
>>>>>>>>>>>>>  The log shows that using only cancel_delayed_work cannot
>>>>>>>>>>>>> avoid freeing a job while the sched is in reset. But I don't
>>>>>>>>>>>>> know exactly where the driver goes wrong. Do you have
>>>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>> running  OR thread
>>>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>>>           */
>>>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>>                  return;
>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>> ring
>>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>>> pid:2253
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, 
>>>>>>>>>>>>>> Emily
>>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>>>> cancel_delayed_work returned true it guarantees that the timeout
>>>>>>>>>>>>>> work is not running. But it merely means there was a pending
>>>>>>>>>>>>>> timeout work which was removed from the workqueue before its
>>>>>>>>>>>>>> timer elapsed and so didn't have a chance to be dequeued and
>>>>>>>>>>>>>> executed; it doesn't cover already executing work. So there is a
>>>>>>>>>>>>>> possibility that while a timeout work started executing, another
>>>>>>>>>>>>>> timeout work already got enqueued (maybe through earlier cleanup
>>>>>>>>>>>>>> jobs or through drm_sched_fault), and if at this point another
>>>>>>>>>>>>>> drm_sched_cleanup_jobs runs, cancel_delayed_work(&sched->work_tdr)
>>>>>>>>>>>>>> will return true even while there is a timeout job in progress.
>>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work, as the timeout
>>>>>>>>>>>>>> work itself waits for the scheduler thread to be parked again when
>>>>>>>>>>>>>> calling park_thread.
>>>>>>>>>>>>>> Andrey
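
The race being described can be sketched as a timeline in comments (an
illustration of the argument above, not actual code paths):

        /*
         * CPU A (timeout handler)          CPU B (drm_sched_cleanup_jobs)
         * ------------------------         ------------------------------
         * work_tdr dequeued, handler runs
         * handler re-arms work_tdr
         * (handler still executing)        cancel_delayed_work(&sched->work_tdr)
         *                                    -> returns true, but it only
         *                                       cancelled the newly armed
         *                                       pending instance
         *                                  goes on to free jobs while the
         *                                  old handler instance still runs
         *
         * cancel_delayed_work_sync() would flush the running handler, but
         * the handler itself waits for the scheduler thread to park, so
         * flushing it from that context would deadlock.
         */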
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>>> behalf of
>>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>> running */
>>>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>>> We never free jobs while the timeout worker is running to
>>>>>>>>>>>>>> prevent exactly that issue.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>         The drm_sched_job_timedout -> amdgpu_job_timedout path
>>>>>>>>>>>>>>> calls amdgpu_device_gpu_recover. I mean the main scheduler frees
>>>>>>>>>>>>>>> the jobs while we are in amdgpu_device_gpu_recover, and before
>>>>>>>>>>>>>>> calling drm_sched_stop.
>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in this 
>>>>>>>>>>>>>>>> case?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>          No, I am on the new branch and also have the patch.
>>>>>>>>>>>>>>>>> Even if the jobs are freed by the main scheduler, how could we
>>>>>>>>>>>>>>>>> avoid the main scheduler freeing jobs while we are in
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>>>>>> if no
>>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>>           Please refer to the following log: when it enters
>>>>>>>>>>>>>>>>>>> the amdgpu_device_gpu_recover function, the bad job
>>>>>>>>>>>>>>>>>>> 000000005086879e is being freed in amdgpu_job_free_cb at the
>>>>>>>>>>>>>>>>>>> same time because of the hardware fence signal. But
>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover goes faster; in this case the
>>>>>>>>>>>>>>>>>>> s_fence is already freed, but the job is not freed in time.
>>>>>>>>>>>>>>>>>>> Then this issue occurs.
>>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>>>> *ERROR*
>>>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>> process pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>>>> [  449.801040]
>>>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>>>> +
>>>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>>>> [
>>>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>>>> 0000000000000000
>>>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>>>> RSI:
>>>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>>>> RBP:
>>>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>>>> 0000000000000148
>>>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>>>> R14:
>>>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 449.809004] 
>>>>>>>>>>>>>>>>>>> FS:
>>>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>>>>>> ES: 0000
>>>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>>>> CR3:
>>>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>>>> DR0:
>>>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>>> [  449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu]
>>>>>>>>>>>>>>>>>>> [  449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched]
>>>>>>>>>>>>>>>>>>> [  449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu]
>>>>>>>>>>>>>>>>>>> [  449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched]
>>>>>>>>>>>>>>>>>>> [  449.814077]  process_one_work+0x1fd/0x3f0
>>>>>>>>>>>>>>>>>>> [  449.814417]  worker_thread+0x34/0x410
>>>>>>>>>>>>>>>>>>> [  449.814728]  kthread+0x121/0x140
>>>>>>>>>>>>>>>>>>> [  449.815004]  ? process_one_work+0x3f0/0x3f0
>>>>>>>>>>>>>>>>>>> [  449.815374]  ? kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>>> [  449.815799]  ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some
>>>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this 
>>>>>>>>>>>>>>>>>>>>>>>> time,
>>>>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is 
>>>>>>>>>>>>>>>>>>>>>>>> already
>>>>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing 
>>>>>>>>>>>>>>>>>>>>>> the
>>>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> |  2
>>>>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c |
>>>>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>>>>> ++++++---
>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 7 insertions(+), 6
>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>>>>> parent))
>>>>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>>>>> - if
>>>>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>> * This iteration is thread safe as
>>>>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>> + if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>>> * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>>>>> - struct dma_fence *fence =
>>>>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>>> + struct dma_fence *fence =
>>>>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 16:44                                                                                                             ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-18 16:44 UTC (permalink / raw)
  To: Andrey Grodzovsky, Deng, Emily, amd-gfx

list_empty_careful() should only be used as an optimization, but never
if you need to rely on the result.

The problem is that the function doesn't have any memory barriers
whatsoever; it just checks whether both the next and the prev pointer
indicate an empty list instead of just the next pointer.

Christian.

Am 18.11.19 um 17:23 schrieb Andrey Grodzovsky:
> Can you explain why? As I see it, list_empty_careful is specifically 
> designed for the case where the only other concurrent operation in 
> progress is list_del_init 
> (https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html), 
> which is exactly what happens in this patch; no other list-altering 
> operation can take place concurrently, so it looks safe to use to me.
>
> Andrey
>
> On 11/18/19 11:16 AM, Christian König wrote:
>> Hi Andrey,
>>
>> the only thing which doesn't look so good is the switch to 
>> list_empty_careful in drm_sched_cleanup_jobs.
>>
>> We either take the lock here or we don't, but please don't add that 
>> extra checking.
>>
>> Christian.
>>
>> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>>> Thanks Emily.
>>>
>>> Christian - ping for review.
>>>
>>> Andrey
>>>
>>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>>> Hi Andrey,
>>>>       Currently, I am busy with another issue; maybe I will try it 
>>>> next week.
>>>>
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>
>>>>> Attached.
>>>>>
>>>>> Emily - can you give it a try ?
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>>> What about instead of peeking at the job to actually remove it from
>>>>>>> ring_mirror_list right there,
>>>>>> Also an interesting idea. We would need to protect the mirror list
>>>>>> with a lock again, but that should be the lesser evil.
>>>>>>
>>>>>> Maybe prototype that and see if it works or not.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>>
>>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>>> Another more fundamental question: Could we get rid of the timeout
>>>>>>>> job at all?
>>>>>>>
>>>>>>> There is other stuff there besides picking the first unfinished 
>>>>>>> job which is common to all the drivers - such as freeing the 
>>>>>>> guilty job once it has signaled and rearming the timeout work 
>>>>>>> timer.
>>>>>>>
>>>>>>>
>>>>>>>> I mean we used to give this as a parameter to the scheduler callback
>>>>>>>> because we had the timeout worker in the job, but that is no longer
>>>>>>>> the case.
>>>>>>>>
>>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>>          job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>>                                         struct drm_sched_job, 
>>>>>>>>> node);
>>>>>>>> Why don't we just remove that here and only get the first job 
>>>>>>>> after
>>>>>>>> we have stopped the scheduler?
>>>>>>>
>>>>>>> Should be ok since we have the extra check for __kthread_should_park
>>>>>>> in drm_sched_cleanup_jobs which will protect us in this case from a
>>>>>>> wakeup of the sched thread and execution of drm_sched_cleanup_jobs
>>>>>>> after we already parked it. The problem here is we need the
>>>>>>> drm_sched_job to access the private data for each client driver (see
>>>>>>> amdgpu_job_timedout for example). What about, instead of peeking at
>>>>>>> the job, actually removing it from ring_mirror_list right there and
>>>>>>> going ahead with it through the reset routine; if it has signaled in
>>>>>>> the meanwhile, great - release it, otherwise put it back into
>>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>>> This is why I asked for a trace with the timer enabled, but since 
>>>>>>>>> there is a finite number of places where we touch the timer, Emily 
>>>>>>>>> can just put prints there. Also, I wonder whether this temp fix 
>>>>>>>>> helps her with the issue or not.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>>>>> occur?
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>>> simulate_crash.patch - waiting on the guilty job to signal in the
>>>>>>>>>>> reset work and artificially rearming the timeout timer just before
>>>>>>>>>>> the check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This, I
>>>>>>>>>>> think, confirms the theory I described earlier in this thread.
>>>>>>>>>>>
>>>>>>>>>>> basic_fix.patch handles this by testing whether another timer is
>>>>>>>>>>> already armed on this scheduler or a timeout work is in execution
>>>>>>>>>>> right now (see the documentation for work_busy) - obviously this
>>>>>>>>>>> is not a full solution, as it will not protect from races if, for
>>>>>>>>>>> example, there is immediate work scheduling such as in
>>>>>>>>>>> drm_sched_fault - so we probably need to account for this by
>>>>>>>>>>> making drm_sched_cleanup_jobs (at least the part where it iterates
>>>>>>>>>>> the ring mirror list and frees jobs) and GPU reset really mutually
>>>>>>>>>>> exclusive, not like now.
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>
>>>>>>>>>>>> you need to print which scheduler instance is freeing the jobs
>>>>>>>>>>>> and which one is triggering the reset. The TID and PID are
>>>>>>>>>>>> completely meaningless here since we are called from different
>>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>>
>>>>>>>>>>>> Apart from that I will look into this a bit deeper when I have
>>>>>>>>>>>> time.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>      I added the following print in function 
>>>>>>>>>>>>> drm_sched_cleanup_jobs.
>>>>>>>>>>>>>  The log shows that using only cancel_delayed_work cannot
>>>>>>>>>>>>> avoid freeing a job while the sched is in reset. But I don't
>>>>>>>>>>>>> know exactly where the driver goes wrong. Do you have
>>>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>> running  OR thread
>>>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>>>           */
>>>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>>                  return;
>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>> ring
>>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>>> pid:2253
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, 
>>>>>>>>>>>>>> Emily
>>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>>>> cancel_delayed_work returned true it guarantees that the timeout
>>>>>>>>>>>>>> work is not running. But it merely means there was a pending
>>>>>>>>>>>>>> timeout work which was removed from the workqueue before its
>>>>>>>>>>>>>> timer elapsed and so didn't have a chance to be dequeued and
>>>>>>>>>>>>>> executed; it doesn't cover already executing work. So there is a
>>>>>>>>>>>>>> possibility that while a timeout work started executing, another
>>>>>>>>>>>>>> timeout work already got enqueued (maybe through earlier cleanup
>>>>>>>>>>>>>> jobs or through drm_sched_fault), and if at this point another
>>>>>>>>>>>>>> drm_sched_cleanup_jobs runs, cancel_delayed_work(&sched->work_tdr)
>>>>>>>>>>>>>> will return true even while there is a timeout job in progress.
>>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work, as the timeout
>>>>>>>>>>>>>> work itself waits for the scheduler thread to be parked again when
>>>>>>>>>>>>>> calling park_thread.
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>>> behalf of
>>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>> running */
>>>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>>> We never free jobs while the timeout working is running to
>>>>>>>>>>>>> prevent exactly
>>>>>>>>>>>>>> that issue.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout 
>>>>>>>>>>>>>>> call
>>>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler free 
>>>>>>>>>>>>>> the
>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>>>> drm_sched_stop.
>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in this 
>>>>>>>>>>>>>>>> case?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty job
>>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only
>>>>>>>>>>>>> if no
>>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>>>>> function
>>>>>>>>>>>>>>>>>> amdgpu_job_free_cb at the same time, because of the
>>>>>>>>>>>>> hardware fence
>>>>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this case,
>>>>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed in 
>>>>>>>>>>>>>>>>>> time.
>>>>>>>>>>>>> Then this issue
>>>>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>>>> *ERROR*
>>>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>> process pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>>>> [  449.801040]
>>>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted:
>>>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>>>> +
>>>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>>>> [
>>>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85
>>>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48
>>>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>>>> 0000000000000000
>>>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>>>> RSI:
>>>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>>>> RBP:
>>>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>>>> 0000000000000148
>>>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>>>> R14:
>>>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 449.809004] 
>>>>>>>>>>>>>>>>>>> FS:
>>>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 0000
>>>>>>>>>>>>> ES: 0000
>>>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>>>> CR3:
>>>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>>>> DR0:
>>>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>>>> [amdgpu]
>>>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 449.814417]
>>>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some
>>>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is
>>>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job
>>>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup.
>>>>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this 
>>>>>>>>>>>>>>>>>>>>>>>> time,
>>>>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is 
>>>>>>>>>>>>>>>>>>>>>>>> already
>>>>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing 
>>>>>>>>>>>>>>>>>>>>>> the
>>>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> |  2
>>>>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c |
>>>>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>>>>> ++++++---
>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 7 insertions(+), 6
>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>>>>> parent))
>>>>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>>>>> - if
>>>>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>> * This iteration is thread safe as
>>>>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>> + if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>>> * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>>              */
>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>>>>> - struct dma_fence *fence =
>>>>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>>> + struct dma_fence *fence =
>>>>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>> _______________________________________________
>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx mailing list amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>>>>>>>>>>> _______________________________________________
>>>>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>>> _______________________________________________
>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 17:01                                                                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-18 17:01 UTC (permalink / raw)
  To: Christian König, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

The documentation states it can be used safely with concurrent 
list_del_init, so I assume that's true - but I think my even bigger 
mistake is that, without locking, I just do list_first_entry right after 
list_empty_careful, and by this I can grab a pointer to the same job as a 
concurrent drm_sched_job_timedout->list_first_entry_or_null. So yes, I 
see now that I have to use locking as you advised, and then I don't need 
the list_empty_careful.
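
Something like this is what I have in mind for drm_sched_cleanup_jobs - 
just a rough sketch, assuming sched->job_list_lock is still the lock that 
guards ring_mirror_list, not the actual patch:

	struct drm_sched_job *job;

	/* Peek and unlink under the lock so a concurrent
	 * drm_sched_job_timedout cannot pick the same entry.
	 */
	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	if (job && dma_fence_is_signaled(&job->s_fence->finished))
		list_del_init(&job->node);
	else
		job = NULL;
	spin_unlock(&sched->job_list_lock);

	if (!job)
		return;

	/* ... free the job as before, outside the lock ... */

Freeing the job itself can then stay outside the lock; the lock only 
covers the peek and the unlink.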

Andrey

On 11/18/19 11:44 AM, Christian König wrote:
> list_empty_careful() should only be used as an optimization, but 
> never if you need to rely on the result.
>
> The problem is that the function doesn't have any memory barriers 
> whatsoever; it just checks whether the next and prev pointers are both 
> empty instead of just the next pointer.
>
> Christian.
>
> Am 18.11.19 um 17:23 schrieb Andrey Grodzovsky:
>> Can you explain why? As I see it, list_empty_careful is 
>> specifically designed for the case where the only other concurrent 
>> operation in progress is list_del_init 
>> (https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html) 
>> - which is exactly what happens in this patch; no other list-altering 
>> operation can take place concurrently, so it looks safe to use to me.
>>
>> Andrey
>>
>> On 11/18/19 11:16 AM, Christian König wrote:
>>> Hi Andrey,
>>>
>>> the only thing which doesn't look so good is the switch to 
>>> list_empty_careful in drm_sched_cleanup_jobs.
>>>
>>> We either take the lock here or we don't, but please not that extra 
>>> checking.
>>>
>>> Christian.
>>>
>>> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>>>> Thanks Emily.
>>>>
>>>> Christian - ping for review.
>>>>
>>>> Andrey
>>>>
>>>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>>>> Hi Andrey,
>>>>>       Currently, I am busy with another issue; maybe I will try 
>>>>> next week.
>>>>>
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Attached.
>>>>>>
>>>>>> Emily - can you give it a try ?
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>>>> What about instead of peeking at the job to actually remove it 
>>>>>>>> from
>>>>>>>> ring_mirror_list right there,
>>>>>>> Also an interesting idea. We would need to protect the mirror list
>>>>>>> with a lock again, but that should be the lesser evil.
>>>>>>>
>>>>>>> Maybe prototype that and see if it works or not.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>>>
>>>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>>>> Another more fundamental question: Could we get rid of the 
>>>>>>>>> timeout
>>>>>>>>> job at all?
>>>>>>>>
>>>>>>>> There is other stuff there besides picking the first unfinished 
>>>>>>>> job which is common for all the drivers - such as freeing the 
>>>>>>>> guilty job once it has signaled and rearming the timeout work 
>>>>>>>> timer.
>>>>>>>>
>>>>>>>>
>>>>>>>>> I mean we used to give this as a parameter to the scheduler 
>>>>>>>>> callback because we had the timeout worker in the job, but that 
>>>>>>>>> is no longer the case.
>>>>>>>>>
>>>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>>>          job = 
>>>>>>>>>> list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>>>                                         struct drm_sched_job, 
>>>>>>>>>> node);
>>>>>>>>> Why don't we just remove that here and only get the first job 
>>>>>>>>> after
>>>>>>>>> we have stopped the scheduler?
>>>>>>>>
>>>>>>>> Should be ok, since we have the extra check for 
>>>>>>>> __kthread_should_park in drm_sched_cleanup_jobs, which will 
>>>>>>>> protect us in this case from a wakeup of the sched thread and 
>>>>>>>> execution of drm_sched_cleanup_jobs after we already parked it. 
>>>>>>>> The problem here is that we need the drm_sched_job to access the 
>>>>>>>> private data for each client driver (see amdgpu_job_timedout for 
>>>>>>>> example). What about, instead of just peeking at the job, 
>>>>>>>> actually removing it from ring_mirror_list right there and going 
>>>>>>>> ahead with it through the reset routine; if it's signaled in the 
>>>>>>>> meanwhile, great - release it, otherwise put it back into 
>>>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>>>> This is why I asked for a trace with the timer enabled, but 
>>>>>>>>>> since there is a finite number of places where we touch the 
>>>>>>>>>> timer, Emily can just put prints there. Also, I wonder whether 
>>>>>>>>>> this temp fix helps her with the issue or not.
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>>>>>> occur?
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>>>> I was able to reproduce the crash by using the attached 
>>>>>>>>>>>> simulate_crash.patch - waiting on the guilty job to signal in 
>>>>>>>>>>>> the reset work and artificially rearming the timeout timer 
>>>>>>>>>>>> just before the check for 
>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr) in 
>>>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. 
>>>>>>>>>>>> This, I think, confirms the theory I described earlier in 
>>>>>>>>>>>> this thread.
>>>>>>>>>>>>
>>>>>>>>>>>> basic_fix.patch handles this by testing whether another timer 
>>>>>>>>>>>> is already armed on this scheduler or whether a timeout work 
>>>>>>>>>>>> is executing right now (see the documentation for work_busy). 
>>>>>>>>>>>> Obviously this is not a full solution, as it will not protect 
>>>>>>>>>>>> from races if, for example, there is immediate work 
>>>>>>>>>>>> scheduling such as in drm_sched_fault - so we probably need 
>>>>>>>>>>>> to account for this by making drm_sched_cleanup_jobs (at 
>>>>>>>>>>>> least the part where it iterates the ring mirror list and 
>>>>>>>>>>>> frees jobs) and GPU reset really mutually exclusive, and not 
>>>>>>>>>>>> like now.
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>
>>>>>>>>>>>>> you need to print which scheduler instance is freeing the 
>>>>>>>>>>>>> jobs
>>>>>>>>>>>>> and which one is triggering the reset. The TID and PID are
>>>>>>>>>>>>> completely meaningless here since we are called from 
>>>>>>>>>>>>> different
>>>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Apart from that I will look into this a bit deeper when I 
>>>>>>>>>>>>> have
>>>>>>>>>>>>> time.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>      I added the following print in the function 
>>>>>>>>>>>>>> drm_sched_cleanup_jobs.
>>>>>>>>>>>>>> From the log it shows that using cancel_delayed_work alone 
>>>>>>>>>>>>>> could not avoid freeing the job while the sched is in 
>>>>>>>>>>>>>> reset. But I don't know exactly what is wrong in the 
>>>>>>>>>>>>>> driver. Do you have any suggestion about this?
>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is running OR thread
>>>>>>>>>>>>>>           * is being parked and hence assumed to not touch ring_mirror_list
>>>>>>>>>>>>>>           */
>>>>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>               !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>>>                  return;
>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>> ring
>>>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>>>> pid:2253
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, 
>>>>>>>>>>>>>>> Emily
>>>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>>>>>> returned true it guarantees that timeout work is not 
>>>>>>>>>>>>>>> running
>>>>>>>>>>>>>> but, it merely
>>>>>>>>>>>>>>> means there was a pending timeout work which was removed
>>>>>> from
>>>>>>>>>>>>>>> the workqueue before it's timer elapsed and so it didn't 
>>>>>>>>>>>>>>> have
>>>>>>>>>>>>>>> a
>>>>>>>>>>>>>> chance to be
>>>>>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>>>>>> work. So there is a
>>>>>>>>>>>>>>> possibility where while timeout work started executing 
>>>>>>>>>>>>>>> another
>>>>>>>>>>>>>> timeout work
>>>>>>>>>>>>>>> already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>>>>>>>> through
>>>>>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>>>>>>> while there is a
>>>>>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as 
>>>>>>>>>>>>>>> timeout
>>>>>>>>>>>>>> work itself
>>>>>>>>>>>>>>> waits for schedule thread  to be parked again when calling
>>>>>>>>>>>>>> park_thread.
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>>>> behalf of
>>>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout 
>>>>>>>>>>>>>>>> worker is
>>>>>>>>>>>>>> running */
>>>>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>>>> We never free jobs while the timeout working is running to
>>>>>>>>>>>>>> prevent exactly
>>>>>>>>>>>>>>> that issue.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>         The drm_sched_job_timedout-> 
>>>>>>>>>>>>>>>> amdgpu_job_timedout call
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler 
>>>>>>>>>>>>>>> free the
>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>>>>> drm_sched_stop.
>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in 
>>>>>>>>>>>>>>>>> this case?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty 
>>>>>>>>>>>>>>>>> job
>>>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and 
>>>>>>>>>>>>>>>>>>> only
>>>>>>>>>>>>>> if no
>>>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>>>>>> function
>>>>>>>>>>>>>>>>>>> amdgpu_job_free_cb at the same time, because of the
>>>>>>>>>>>>>> hardware fence
>>>>>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this 
>>>>>>>>>>>>>>>>>>> case,
>>>>>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed 
>>>>>>>>>>>>>>>>>>> in time.
>>>>>>>>>>>>>> Then this issue
>>>>>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>> *ERROR*
>>>>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>> process pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>>>>> [  449.801040]
>>>>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 
>>>>>>>>>>>>>>>>>>>> Tainted:
>>>>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>>>>> +
>>>>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>>>>> [
>>>>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 
>>>>>>>>>>>>>>>>>>>> 0f 85
>>>>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 
>>>>>>>>>>>>>>>>>>> 00 48
>>>>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>>>>> 0000000000000000
>>>>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>>>>> RSI:
>>>>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>>>>> RBP:
>>>>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>>>>> 0000000000000148
>>>>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>>>>> R14:
>>>>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 
>>>>>>>>>>>>>>>>>>>> 449.809004] FS:
>>>>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 
>>>>>>>>>>>>>>>>>>>> 0000
>>>>>>>>>>>>>> ES: 0000
>>>>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>>>>> CR3:
>>>>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>>>>> DR0:
>>>>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>>>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>>>>> [amdgpu]
>>>>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 449.814417]
>>>>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over 
>>>>>>>>>>>>>>>>>>>>> some
>>>>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the 
>>>>>>>>>>>>>>>>>>>>>>>>>>> s_fence is
>>>>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the 
>>>>>>>>>>>>>>>>>>>>>>>>>> job
>>>>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in 
>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup.
>>>>>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this 
>>>>>>>>>>>>>>>>>>>>>>>>> time,
>>>>>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is 
>>>>>>>>>>>>>>>>>>>>>>>>> already
>>>>>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after 
>>>>>>>>>>>>>>>>>>>>>>> freeing the
>>>>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> |  2
>>>>>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c |
>>>>>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>>>>>> ++++++---
>>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 7 insertions(+), 6
>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> +++
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>>>>>> parent))
>>>>>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> - if
>>>>>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>> * This iteration is thread safe as
>>>>>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>>> + if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>>>> * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>>>>>> - struct dma_fence *fence =
>>>>>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>>>> + struct dma_fence *fence =
>>>>>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 17:01                                                                                                                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-18 17:01 UTC (permalink / raw)
  To: Christian König, Deng, Emily, amd-gfx

The documentation states it can be used safely with concurrent 
list_del_init, so I assume that is true - but I think my even bigger mistake 
is that, without locking, I just do list_first_entry right after 
list_empty_careful, and by this I can grab a pointer to the same job as a 
concurrent drm_sched_job_timedout->list_first_entry_or_null. So yes, I 
see now that I have to use locking there as you advised, and then I don't 
need the list_empty_careful at all - roughly like the sketch below.
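
Something like this minimal, untested sketch is what I have in mind for the
picking side in drm_sched_cleanup_jobs (I'm assuming the field names
sched->job_list_lock, sched->ring_mirror_list and the node member are still
what we have on this branch):

	struct drm_sched_job *job;

	/* Take the lock instead of list_empty_careful() + list_first_entry(). */
	spin_lock(&sched->job_list_lock);

	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	if (job && dma_fence_is_signaled(&job->s_fence->finished))
		/* Delete under the same lock, so a concurrent
		 * drm_sched_job_timedout() can no longer pick this job. */
		list_del_init(&job->node);
	else
		job = NULL;

	spin_unlock(&sched->job_list_lock);

	if (job)
		sched->ops->free_job(job);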

Andrey

On 11/18/19 11:44 AM, Christian König wrote:
> list_empty_careful() should only be used for optimizing cases, but 
> never if you need to rely on the result.
>
> The problem is that the function doesn't have any memory barriers 
> whatsoever; it just checks whether the next and prev pointers are both 
> empty instead of just the next pointer.
>
> Christian.
>
> Am 18.11.19 um 17:23 schrieb Andrey Grodzovsky:
>> Can you explain why? As I see it, list_empty_careful is 
>> specifically designed for the case where the only other concurrent 
>> operation in progress is list_del_init 
>> (https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html) 
>> - which is exactly what happens in this patch; no other list-altering 
>> operation can take place concurrently, so it looks safe to use to me.
>>
>> Andrey
>>
>> On 11/18/19 11:16 AM, Christian König wrote:
>>> Hi Andrey,
>>>
>>> the only thing which doesn't looks so good is the switch to 
>>> list_empty_careful in drm_sched_cleanup_jobs.
>>>
>>> We either take the lock here or we don't, but please not that extra 
>>> checking.
>>>
>>> Christian.
>>>
>>> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>>>> Thanks Emily.
>>>>
>>>> Christan - ping for review.
>>>>
>>>> Andrey
>>>>
>>>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>>>> Hi Andrey,
>>>>>       Currently, I am busying with another issue, maybe will try 
>>>>> next week.
>>>>>
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>
>>>>>> Attached.
>>>>>>
>>>>>> Emily - can you give it a try ?
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>>>> What about instead of peeking at the job to actually remove it 
>>>>>>>> from
>>>>>>>> ring_mirror_list right there,
>>>>>>> Also an interesting idea. We would need to protect the mirror list
>>>>>>> with a lock again, but that should be the lesser evil.
>>>>>>>
>>>>>>> Maybe prototype that and see if it works or not.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>>>
>>>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>>>> Another more fundamental question: Could we get rid of the 
>>>>>>>>> timeout
>>>>>>>>> job at all?
>>>>>>>>
>>>>>>>> There are other stuff there besides picking the first 
>>>>>>>> unfinished job
>>>>>>>> which is common for all the drivers - such as freeing guilty 
>>>>>>>> signaled
>>>>>>>> job and rearming the timeout work timer.
>>>>>>>>
>>>>>>>>
>>>>>>>>> I mean we used to give this as parameter to the scheduler 
>>>>>>>>> callback
>>>>>>>>> because we had the timeout worker in the job, but that is no 
>>>>>>>>> longer
>>>>>>>>> the case.
>>>>>>>>>
>>>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>>>          job = 
>>>>>>>>>> list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>>>                                         struct drm_sched_job, 
>>>>>>>>>> node);
>>>>>>>>> Why don't we just remove that here and only get the first job 
>>>>>>>>> after
>>>>>>>>> we have stopped the scheduler?
>>>>>>>>
>>>>>>>> Should be ok since we have the extra check for 
>>>>>>>> __kthread_should_park
>>>>>>>> in drm_sched_cleanup_jobs which will protect us in this case 
>>>>>>>> from a
>>>>>>>> wakeup of sched thread and execution of in drm_sched_cleanup_jobs
>>>>>>>> after we already parked it. The problem here is we need the
>>>>>>>> drm_sched_job to access the private data for each client driver 
>>>>>>>> (see
>>>>>>>> amdgpu_job_timedout for example). What about instead of peeking at
>>>>>>>> the job to actually remove it from ring_mirror_list right 
>>>>>>>> there, go
>>>>>>>> ahead with it through the reset routine, if it's signaled in the
>>>>>>>> meanwhile that great - release it, otherwise put it back into
>>>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>>>> This why I asked for a trace with timer enabled, but since 
>>>>>>>>>> there is
>>>>>>>>>> a finite number of places we touch the timer Emily can just put
>>>>>>>>>> prints there. Also, I wonder if this temp fix helps her with the
>>>>>>>>>> issue or not.
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>>>> The question is where do we rearm the timer for this problem to
>>>>>>>>>>> occur?
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in 
>>>>>>>>>>>> reset
>>>>>>>>>>>> work and artificially rearming the timeout timer just 
>>>>>>>>>>>> before the
>>>>>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. 
>>>>>>>>>>>> This I
>>>>>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>>>>>
>>>>>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>>>>>> already armed ob this scheduler or is there a timeout work in
>>>>>>>>>>>> execution right now (see documentation for work_busy) - 
>>>>>>>>>>>> obviously
>>>>>>>>>>>> this is not a full solution as this will not protect from 
>>>>>>>>>>>> races
>>>>>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>>>>>> drm_sched_fault -  so we probably need to account for this by
>>>>>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset really
>>>>>>>>>>>> mutually exclusive and not like now.
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>
>>>>>>>>>>>>> you need to print which scheduler instance is freeing the 
>>>>>>>>>>>>> jobs
>>>>>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>>>>>> completely meaningless here since we are called from 
>>>>>>>>>>>>> different
>>>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Apart from that I will look into this a bit deeper when I 
>>>>>>>>>>>>> have
>>>>>>>>>>>>> time.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>      I add the follow print in function 
>>>>>>>>>>>>>> drm_sched_cleanup_jobs.
>>>>>>>>>>>>>>  From the log it shows that only use cancel_delayed_work 
>>>>>>>>>>>>>> could
>>>>>>>>>>>>>> not avoid to free job when the sched is in reset. But don’t
>>>>>>>>>>>>>> know exactly where it is wrong about the driver. Do you have
>>>>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>>> running  OR thread
>>>>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>>>>           */
>>>>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>>>                  return;
>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>> ring
>>>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>>>> pid:2253
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process information:
>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, 
>>>>>>>>>>>>>>> Emily
>>>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>>>>>> returned true it guarantees that timeout work is not 
>>>>>>>>>>>>>>> running
>>>>>>>>>>>>>> but, it merely
>>>>>>>>>>>>>>> means there was a pending timeout work which was removed
>>>>>> from
>>>>>>>>>>>>>>> the workqueue before it's timer elapsed and so it didn't 
>>>>>>>>>>>>>>> have
>>>>>>>>>>>>>>> a
>>>>>>>>>>>>>> chance to be
>>>>>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>>>>>> work. So there is a
>>>>>>>>>>>>>>> possibility where while timeout work started executing 
>>>>>>>>>>>>>>> another
>>>>>>>>>>>>>> timeout work
>>>>>>>>>>>>>>> already got enqueued (maybe through earlier cleanup jobs or
>>>>>>>>>>>>>> through
>>>>>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>>>>>>> while there is a
>>>>>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as 
>>>>>>>>>>>>>>> timeout
>>>>>>>>>>>>>> work itself
>>>>>>>>>>>>>>> waits for schedule thread  to be parked again when calling
>>>>>>>>>>>>>> park_thread.
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>>>> behalf of
>>>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout 
>>>>>>>>>>>>>>>> worker is
>>>>>>>>>>>>>> running */
>>>>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>>>> We never free jobs while the timeout working is running to
>>>>>>>>>>>>>> prevent exactly
>>>>>>>>>>>>>>> that issue.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>         The drm_sched_job_timedout-> 
>>>>>>>>>>>>>>>> amdgpu_job_timedout call
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler 
>>>>>>>>>>>>>>> free the
>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>>>>> drm_sched_stop.
>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in 
>>>>>>>>>>>>>>>>> this case?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a guilty 
>>>>>>>>>>>>>>>>> job
>>>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to free
>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and 
>>>>>>>>>>>>>>>>>>> only
>>>>>>>>>>>>>> if no
>>>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>>>           Please refer to follow log, when it enter to
>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>>>>>> function
>>>>>>>>>>>>>>>>>>> amdgpu_job_free_cb at the same time, because of the
>>>>>>>>>>>>>> hardware fence
>>>>>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this 
>>>>>>>>>>>>>>>>>>> case,
>>>>>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed 
>>>>>>>>>>>>>>>>>>> in time.
>>>>>>>>>>>>>> Then this issue
>>>>>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>> *ERROR*
>>>>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>> process pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>>>>> [  449.801040]
>>>>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 
>>>>>>>>>>>>>>>>>>>> Tainted:
>>>>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>>>>> +
>>>>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>>>>> [
>>>>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 
>>>>>>>>>>>>>>>>>>>> 0f 85
>>>>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 
>>>>>>>>>>>>>>>>>>> 00 48
>>>>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>>>>> 0000000000000000
>>>>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>>>>> RSI:
>>>>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>>>>> RBP:
>>>>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>>>>> 0000000000000148
>>>>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>>>>> R14:
>>>>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 
>>>>>>>>>>>>>>>>>>>> 449.809004] FS:
>>>>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 
>>>>>>>>>>>>>>>>>>>> 0000
>>>>>>>>>>>>>> ES: 0000
>>>>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>>>>> CR3:
>>>>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>>>>> DR0:
>>>>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>>>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>>>>> [amdgpu]
>>>>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139]  ?
>>>>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 449.814417]
>>>>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over 
>>>>>>>>>>>>>>>>>>>>> some
>>>>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up
>>>>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the 
>>>>>>>>>>>>>>>>>>>>>>>>>>> s_fence is
>>>>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the 
>>>>>>>>>>>>>>>>>>>>>>>>>> job
>>>>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in 
>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup.
>>>>>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this 
>>>>>>>>>>>>>>>>>>>>>>>>> time,
>>>>>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is 
>>>>>>>>>>>>>>>>>>>>>>>>> already
>>>>>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after 
>>>>>>>>>>>>>>>>>>>>>>> freeing the
>>>>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem
>>>>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>>>>>>>>>>>>>>  drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>>>>>>>>>>>>>>>>>  2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	 *
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	 */
>>>>>>>>>>>>>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>>>>>>>>>>>>>> 		job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> 			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>>>> 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> 					if (atomic_read(&bad->karma) >
>>>>>>>>>>>>>>>>>>>>>>>>>>> 					    bad->sched->hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>>>> 						if (entity->guilty)
>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	 */
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>> 		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>>>>>>>>>>>>>>>>>>>>>> 					      &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> 			atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>> 			 *
>>>>>>>>>>>>>>>>>>>>>>>>>>> 			 * Job is still alive so fence refcount at least 1
>>>>>>>>>>>>>>>>>>>>>>>>>>> 			 */
>>>>>>>>>>>>>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> 			/*
>>>>>>>>>>>>>>>>>>>>>>>>>>> 			 * We must keep bad job alive for later use during
>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	 */
>>>>>>>>>>>>>>>>>>>>>>>>>>> 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> 		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 20:01                                                                                                                     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-18 20:01 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Well then we should probably update the documentation.

Take a look at the implementation: there is no compiler or SMP barrier 
in there at all (rough sketch below).
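
Roughly, what include/linux/list.h has at the moment is just this
(paraphrased, so double check against your tree):

/* include/linux/list.h - roughly the current implementation */
static inline int list_empty_careful(const struct list_head *head)
{
	struct list_head *next = head->next;

	/* Two plain loads, no READ_ONCE()/smp_rmb() or any other barrier. */
	return (next == head) && (next == head->prev);
}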

Christian.

Am 18.11.19 um 18:01 schrieb Andrey Grodzovsky:
> The documentation states it can be used safely with concurrent 
> list_del_init so I assume it's true - but I think my even bigger 
> mistake is that without locking i just do list_first_entry right after 
> list_empty_careful and by this can grab pointer to the same job as 
> concurrent drm_sched_job_timedout->list_first_entry_or_null - so yes I 
> see now i have to use locking as you advised there and then i don't 
> need the list_empty_careful.
>
> Andrey
>
> On 11/18/19 11:44 AM, Christian König wrote:
>> list_empty_careful() should only be used for optimizing cases, but 
>> never if you need to rely on the result.
>>
>> The problem is that the function doesn't have any memory barriers 
>> whatsoever; it just checks whether the next and prev pointers are both 
>> empty instead of just the next pointer.
>>
>> Christian.
>>
>> Am 18.11.19 um 17:23 schrieb Andrey Grodzovsky:
>>> Can you explain why ? As I see it - list_empty_careful is 
>>> specifically designed for the case where the only other concurrent 
>>> operation in progress is list_del_init 
>>> (https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html) 
>>> - which is exactly what happens in this patch, no other list 
>>> altering operation can take place concurrently - so it looks safe to 
>>> use for me.
>>>
>>> Andrey
>>>
>>> On 11/18/19 11:16 AM, Christian König wrote:
>>>> Hi Andrey,
>>>>
>>>> the only thing which doesn't looks so good is the switch to 
>>>> list_empty_careful in drm_sched_cleanup_jobs.
>>>>
>>>> We either take the lock here or we don't, but please not that extra 
>>>> checking.
>>>>
>>>> Christian.
>>>>
>>>> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>>>>> Thanks Emily.
>>>>>
>>>>> Christan - ping for review.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>>>>> Hi Andrey,
>>>>>>       Currently, I am busying with another issue, maybe will try 
>>>>>> next week.
>>>>>>
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>>
>>>>>>> Attached.
>>>>>>>
>>>>>>> Emily - can you give it a try ?
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>>>>> What about instead of peeking at the job to actually remove it 
>>>>>>>>> from
>>>>>>>>> ring_mirror_list right there,
>>>>>>>> Also an interesting idea. We would need to protect the mirror list
>>>>>>>> with a lock again, but that should be the lesser evil.
>>>>>>>>
>>>>>>>> Maybe prototype that and see if it works or not.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>>>>
>>>>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>>>>> Another more fundamental question: Could we get rid of the 
>>>>>>>>>> timeout
>>>>>>>>>> job at all?
>>>>>>>>>
>>>>>>>>> There are other stuff there besides picking the first 
>>>>>>>>> unfinished job
>>>>>>>>> which is common for all the drivers - such as freeing guilty 
>>>>>>>>> signaled
>>>>>>>>> job and rearming the timeout work timer.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> I mean we used to give this as parameter to the scheduler 
>>>>>>>>>> callback
>>>>>>>>>> because we had the timeout worker in the job, but that is no 
>>>>>>>>>> longer
>>>>>>>>>> the case.
>>>>>>>>>>
>>>>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>>>>          job = 
>>>>>>>>>>> list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>>>>                                         struct 
>>>>>>>>>>> drm_sched_job, node);
>>>>>>>>>> Why don't we just remove that here and only get the first job 
>>>>>>>>>> after
>>>>>>>>>> we have stopped the scheduler?
>>>>>>>>>
>>>>>>>>> Should be ok since we have the extra check for 
>>>>>>>>> __kthread_should_park
>>>>>>>>> in drm_sched_cleanup_jobs which will protect us in this case 
>>>>>>>>> from a
>>>>>>>>> wakeup of sched thread and execution of in drm_sched_cleanup_jobs
>>>>>>>>> after we already parked it. The problem here is we need the
>>>>>>>>> drm_sched_job to access the private data for each client 
>>>>>>>>> driver (see
>>>>>>>>> amdgpu_job_timedout for example). What about instead of 
>>>>>>>>> peeking at
>>>>>>>>> the job to actually remove it from ring_mirror_list right 
>>>>>>>>> there, go
>>>>>>>>> ahead with it through the reset routine, if it's signaled in the
>>>>>>>>> meanwhile that great - release it, otherwise put it back into
>>>>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>>>>> This why I asked for a trace with timer enabled, but since 
>>>>>>>>>>> there is
>>>>>>>>>>> a finite number of places we touch the timer Emily can just put
>>>>>>>>>>> prints there. Also, I wonder if this temp fix helps her with 
>>>>>>>>>>> the
>>>>>>>>>>> issue or not.
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>>>>> The question is where do we rearm the timer for this 
>>>>>>>>>>>> problem to
>>>>>>>>>>>> occur?
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in 
>>>>>>>>>>>>> reset
>>>>>>>>>>>>> work and artificially rearming the timeout timer just 
>>>>>>>>>>>>> before the
>>>>>>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. 
>>>>>>>>>>>>> This I
>>>>>>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>>>>>>
>>>>>>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>>>>>>> already armed ob this scheduler or is there a timeout work in
>>>>>>>>>>>>> execution right now (see documentation for work_busy) - 
>>>>>>>>>>>>> obviously
>>>>>>>>>>>>> this is not a full solution as this will not protect from 
>>>>>>>>>>>>> races
>>>>>>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>>>>>>> drm_sched_fault -  so we probably need to account for this by
>>>>>>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset 
>>>>>>>>>>>>> really
>>>>>>>>>>>>> mutually exclusive and not like now.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> you need to print which scheduler instance is freeing the 
>>>>>>>>>>>>>> jobs
>>>>>>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>>>>>>> completely meaningless here since we are called from 
>>>>>>>>>>>>>> different
>>>>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Apart from that I will look into this a bit deeper when I 
>>>>>>>>>>>>>> have
>>>>>>>>>>>>>> time.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>      I add the follow print in function 
>>>>>>>>>>>>>>> drm_sched_cleanup_jobs.
>>>>>>>>>>>>>>>  From the log it shows that only use cancel_delayed_work 
>>>>>>>>>>>>>>> could
>>>>>>>>>>>>>>> not avoid to free job when the sched is in reset. But don’t
>>>>>>>>>>>>>>> know exactly where it is wrong about the driver. Do you 
>>>>>>>>>>>>>>> have
>>>>>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>>>> running  OR thread
>>>>>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>>>>>           */
>>>>>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>>>>                  return;
>>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>> ring
>>>>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process 
>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>>>>> pid:2253
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process 
>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, 
>>>>>>>>>>>>>>>> Emily
>>>>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>>>>>>> returned true it guarantees that timeout work is not 
>>>>>>>>>>>>>>>> running
>>>>>>>>>>>>>>> but, it merely
>>>>>>>>>>>>>>>> means there was a pending timeout work which was removed
>>>>>>> from
>>>>>>>>>>>>>>>> the workqueue before it's timer elapsed and so it 
>>>>>>>>>>>>>>>> didn't have
>>>>>>>>>>>>>>>> a
>>>>>>>>>>>>>>> chance to be
>>>>>>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>>>>>>> work. So there is a
>>>>>>>>>>>>>>>> possibility where while timeout work started executing 
>>>>>>>>>>>>>>>> another
>>>>>>>>>>>>>>> timeout work
>>>>>>>>>>>>>>>> already got enqueued (maybe through earlier cleanup 
>>>>>>>>>>>>>>>> jobs or
>>>>>>>>>>>>>>> through
>>>>>>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true 
>>>>>>>>>>>>>>>> even
>>>>>>>>>>>>>>> while there is a
>>>>>>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as 
>>>>>>>>>>>>>>>> timeout
>>>>>>>>>>>>>>> work itself
>>>>>>>>>>>>>>>> waits for schedule thread  to be parked again when calling
>>>>>>>>>>>>>>> park_thread.
>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>>>>> behalf of
>>>>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout 
>>>>>>>>>>>>>>>>> worker is
>>>>>>>>>>>>>>> running */
>>>>>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>>>>> We never free jobs while the timeout working is running to
>>>>>>>>>>>>>>> prevent exactly
>>>>>>>>>>>>>>>> that issue.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>         The drm_sched_job_timedout-> 
>>>>>>>>>>>>>>>>> amdgpu_job_timedout call
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler 
>>>>>>>>>>>>>>>> free the
>>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>>>>>> drm_sched_stop.
>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in 
>>>>>>>>>>>>>>>>>> this case?
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a 
>>>>>>>>>>>>>>>>>> guilty job
>>>>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to 
>>>>>>>>>>>>>>>>>> free
>>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and 
>>>>>>>>>>>>>>>>>>>> only
>>>>>>>>>>>>>>> if no
>>>>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>>>>           Please refer to follow log, when it 
>>>>>>>>>>>>>>>>>>>>> enter to
>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>>>>>>> function
>>>>>>>>>>>>>>>>>>>> amdgpu_job_free_cb at the same time, because of the
>>>>>>>>>>>>>>> hardware fence
>>>>>>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this 
>>>>>>>>>>>>>>>>>>>> case,
>>>>>>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed 
>>>>>>>>>>>>>>>>>>>> in time.
>>>>>>>>>>>>>>> Then this issue
>>>>>>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>> *ERROR*
>>>>>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>>> process pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, s_job:000000005086879e [
>>>>>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: process
>>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>>>>>> [  449.801040]
>>>>>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 
>>>>>>>>>>>>>>>>>>>>> Tainted:
>>>>>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX
>>>>>>> +
>>>>>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>>>>>> [
>>>>>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 
>>>>>>>>>>>>>>>>>>>>> 0f 85
>>>>>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 
>>>>>>>>>>>>>>>>>>>> 00 48
>>>>>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>>>>>> 0000000000000000
>>>>>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>>>>>> RSI:
>>>>>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>>>>>> RBP:
>>>>>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>>>>>> 0000000000000148
>>>>>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>>>>>> R14:
>>>>>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 
>>>>>>>>>>>>>>>>>>>>> 449.809004] FS:
>>>>>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 DS: 
>>>>>>>>>>>>>>>>>>>>> 0000
>>>>>>>>>>>>>>> ES: 0000
>>>>>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>>>>>> CR3:
>>>>>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>>>>>> DR0:
>>>>>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>>>>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>>>>>> [amdgpu]
>>>>>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139] ?
>>>>>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 
>>>>>>>>>>>>>>>>>>>>> 449.814417]
>>>>>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over 
>>>>>>>>>>>>>>>>>>>>>> some
>>>>>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to 
>>>>>>>>>>>>>>>>>>>>>>>> wake up
>>>>>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian 
>>>>>>>>>>>>>>>>>>>>>>>>>> <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the 
>>>>>>>>>>>>>>>>>>>>>>>>>>>> s_fence is
>>>>>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when 
>>>>>>>>>>>>>>>>>>>>>>>>>>> the job
>>>>>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in 
>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup.
>>>>>>>>>>>>>>> But in one
>>>>>>>>>>>>>>>>>>>>>>>>>> case, when it enter into the
>>>>>>>>>>>>>>> amdgpu_device_gpu_recover, it
>>>>>>>>>>>>>>>>>>>>>>>>>> already in drm_sched_job_cleanup, and at this 
>>>>>>>>>>>>>>>>>>>>>>>>>> time,
>>>>>>>>>>>>>>> it will
>>>>>>>>>>>>>>>>>>>>>>>>>> go to free
>>>>>>>>>>>>>>>>>>>> job.
>>>>>>>>>>>>>>>>>>>>>>>>>> But the amdgpu_device_gpu_recover sometimes is
>>>>>>>>>>>>>>> faster. At
>>>>>>>>>>>>>>>>>>>>>>>>>> that time, job is not freed, but s_fence is 
>>>>>>>>>>>>>>>>>>>>>>>>>> already
>>>>>>>>>>>>>>> NULL.
>>>>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, 
>>>>>>>>>>>>>>>>>>>>>>>>> s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after 
>>>>>>>>>>>>>>>>>>>>>>>> freeing the
>>>>>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger 
>>>>>>>>>>>>>>>>>>>>>>>> problem
>>>>>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that
>>>>>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> |  2
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c |
>>>>>>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>>>>>>> ++++++---
>>>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 7 insertions(+), 6
>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +++
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>>>>>>> parent))
>>>>>>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>> - if
>>>>>>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +                          if (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>> * This iteration is thread safe as
>>>>>>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>>>> + if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>>>>> * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>>>>>>> - struct dma_fence *fence =
>>>>>>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>>>>> + struct dma_fence *fence =
>>>>>>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>> _______________________________________________
>>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx mailing list 
>>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>>>>>>> <https://lists.freedesktop.org/mailman/listinfo/amd-gfx>
>>>>>>>>>>>>>>>> _______________________________________________
>>>>>>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>>>>>> _______________________________________________
>>>>>>>>>>>>> amd-gfx mailing list
>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>>>>>> _______________________________________________
>>>>>>>>> amd-gfx mailing list
>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>
>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 20:01                                                                                                                     ` Christian König
  0 siblings, 0 replies; 80+ messages in thread
From: Christian König @ 2019-11-18 20:01 UTC (permalink / raw)
  To: Andrey Grodzovsky, Christian König, Deng, Emily, amd-gfx

Well then we should probably update the documentation.

Take a look at the implementation: there is no compiler or SMP barrier
at all.
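
For reference, a rough sketch of what list_empty_careful() looks like here -
paraphrased from include/linux/list.h of this era, not a verbatim copy - just
two plain pointer reads:

    static inline int list_empty_careful(const struct list_head *head)
    {
            struct list_head *next = head->next;

            /* Plain loads only - no READ_ONCE()/smp_* barrier anywhere. */
            return (next == head) && (next == head->prev);
    }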

Christian.

Am 18.11.19 um 18:01 schrieb Andrey Grodzovsky:
> The documentation states it can be used safely with concurrent
> list_del_init, so I assumed it was true - but I think my even bigger
> mistake is that, without locking, I just do list_first_entry right after
> list_empty_careful, and by this I can grab a pointer to the same job as a
> concurrent drm_sched_job_timedout->list_first_entry_or_null - so yes, I
> see now I have to use locking as you advised there, and then I don't
> need the list_empty_careful.
>
> Andrey
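
As an illustration only - not the actual patch, which is attached elsewhere
in the thread - the locked variant being described above could look roughly
like this, assuming sched->job_list_lock is the lock guarding
ring_mirror_list:

    struct drm_sched_job *job;

    spin_lock(&sched->job_list_lock);
    job = list_first_entry_or_null(&sched->ring_mirror_list,
                                   struct drm_sched_job, node);
    if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
            /* Remove under the lock so a concurrent
             * drm_sched_job_timedout() cannot pick up the same job. */
            list_del_init(&job->node);
    } else {
            job = NULL;
    }
    spin_unlock(&sched->job_list_lock);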
>
> On 11/18/19 11:44 AM, Christian König wrote:
>> list_empty_careful() should only be used for optimizing cases, but 
>> never if you need to rely on the result.
>>
>> The problem is that the function doesn't have any memory barriers
>> whatsoever; it just checks whether the next and prev pointers are both
>> empty instead of just the next pointer.
>>
>> Christian.
>>
>> Am 18.11.19 um 17:23 schrieb Andrey Grodzovsky:
>>> Can you explain why? As I see it, list_empty_careful is specifically
>>> designed for the case where the only other concurrent operation in
>>> progress is list_del_init
>>> (https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html),
>>> which is exactly what happens in this patch - no other list-altering
>>> operation can take place concurrently - so it looks safe to use to me.
>>>
>>> Andrey
>>>
>>> On 11/18/19 11:16 AM, Christian König wrote:
>>>> Hi Andrey,
>>>>
>>>> the only thing which doesn't look so good is the switch to
>>>> list_empty_careful in drm_sched_cleanup_jobs.
>>>>
>>>> We either take the lock here or we don't, but please don't do that
>>>> extra checking.
>>>>
>>>> Christian.
>>>>
>>>> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>>>>> Thanks Emily.
>>>>>
>>>>> Christian - ping for review.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>>>>> Hi Andrey,
>>>>>>       Currently, I am busy with another issue; maybe I will try
>>>>>> next week.
>>>>>>
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
>>>>>>>
>>>>>>> Attached.
>>>>>>>
>>>>>>> Emily - can you give it a try?
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>>>>> What about instead of peeking at the job to actually remove it 
>>>>>>>>> from
>>>>>>>>> ring_mirror_list right there,
>>>>>>>> Also an interesting idea. We would need to protect the mirror list
>>>>>>>> with a lock again, but that should be the lesser evil.
>>>>>>>>
>>>>>>>> Maybe prototype that and see if it works or not.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>>>>
>>>>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>>>>> Another more fundamental question: Could we get rid of the 
>>>>>>>>>> timeout
>>>>>>>>>> job at all?
>>>>>>>>>
>>>>>>>>> There is other stuff there besides picking the first unfinished
>>>>>>>>> job which is common for all the drivers - such as freeing the
>>>>>>>>> guilty signaled job and rearming the timeout work timer.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> I mean we used to give this as a parameter to the scheduler
>>>>>>>>>> callback because we had the timeout worker in the job, but that
>>>>>>>>>> is no longer the case.
>>>>>>>>>>
>>>>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>>>>          job = 
>>>>>>>>>>> list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>>>>                                         struct 
>>>>>>>>>>> drm_sched_job, node);
>>>>>>>>>> Why don't we just remove that here and only get the first job 
>>>>>>>>>> after
>>>>>>>>>> we have stopped the scheduler?
>>>>>>>>>
>>>>>>>>> Should be ok, since we have the extra check for
>>>>>>>>> __kthread_should_park in drm_sched_cleanup_jobs which will protect
>>>>>>>>> us in this case from a wakeup of the sched thread and execution of
>>>>>>>>> drm_sched_cleanup_jobs after we have already parked it. The problem
>>>>>>>>> here is that we need the drm_sched_job to access the private data
>>>>>>>>> for each client driver (see amdgpu_job_timedout for example). What
>>>>>>>>> about, instead of just peeking at the job, actually removing it
>>>>>>>>> from ring_mirror_list right there and going ahead with it through
>>>>>>>>> the reset routine; if it's signaled in the meanwhile, great -
>>>>>>>>> release it, otherwise put it back into ring_mirror_list in
>>>>>>>>> drm_sched_resubmit_jobs.
>>>>>>>>>
>>>>>>>>> Andrey
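
A very rough sketch of the idea being floated here - purely illustrative,
reusing names from the discussion (ring_mirror_list, drm_sched_resubmit_jobs;
job_list_lock is assumed to be the lock guarding the list) and not taken from
an actual patch:

    struct drm_sched_job *job;

    /* In drm_sched_job_timedout(): take the first job off the list
     * before handing it to the driver's timedout callback. */
    spin_lock(&sched->job_list_lock);
    job = list_first_entry_or_null(&sched->ring_mirror_list,
                                   struct drm_sched_job, node);
    if (job)
            list_del_init(&job->node);
    spin_unlock(&sched->job_list_lock);

    if (job)
            job->sched->ops->timedout_job(job);

    /* Later, during recovery (e.g. in drm_sched_resubmit_jobs()): if the
     * job signaled in the meanwhile, release it; otherwise put it back
     * onto ring_mirror_list. */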
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>>>>> This is why I asked for a trace with the timer enabled, but
>>>>>>>>>>> since there is a finite number of places where we touch the
>>>>>>>>>>> timer, Emily can just put prints there. Also, I wonder whether
>>>>>>>>>>> this temp fix helps her with the issue or not.
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>>>>> The question is where do we rearm the timer for this 
>>>>>>>>>>>> problem to
>>>>>>>>>>>> occur?
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>>>>> simulate_crash.patch - waiting on the guilty job to signal in
>>>>>>>>>>>>> the reset work and artificially rearming the timeout timer just
>>>>>>>>>>>>> before the check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. This
>>>>>>>>>>>>> I think confirms the theory I described earlier in this thread.
>>>>>>>>>>>>>
>>>>>>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>>>>>>> is already armed on this scheduler or a timeout work is
>>>>>>>>>>>>> executing right now (see the documentation for work_busy).
>>>>>>>>>>>>> Obviously this is not a full solution, as it will not protect
>>>>>>>>>>>>> from races if, for example, there is immediate work scheduling
>>>>>>>>>>>>> such as in drm_sched_fault - so we probably need to account for
>>>>>>>>>>>>> this by making drm_sched_cleanup_jobs (at least in the part
>>>>>>>>>>>>> where it iterates the ring mirror list and frees jobs) and GPU
>>>>>>>>>>>>> reset really mutually exclusive, not like now.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
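
Purely as an illustration - the actual basic_fix.patch is only attached to
the original mail and is not reproduced here - the described check could look
something like this at the top of drm_sched_cleanup_jobs():

    /* Bail out if a timeout handler is pending or already running
     * on this scheduler (see the work_busy() documentation). */
    if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
        work_busy(&sched->work_tdr.work))
            return;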
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> you need to print which scheduler instance is freeing the
>>>>>>>>>>>>>> jobs and which one is triggering the reset. The TID and PID
>>>>>>>>>>>>>> are completely meaningless here, since we are called from
>>>>>>>>>>>>>> different worker threads and the TID/PID can change on each
>>>>>>>>>>>>>> call.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Apart from that I will look into this a bit deeper when I 
>>>>>>>>>>>>>> have
>>>>>>>>>>>>>> time.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>> Christian.
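
For example, such a print could look roughly like this (sched->name is
assumed here to hold the ring name), instead of logging the TID/PID:

    printk("%s: sched ring=%s (%p), s_job=%p\n",
           __func__, s_job->sched->name, s_job->sched, s_job);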
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>      I added the following print in function
>>>>>>>>>>>>>>> drm_sched_cleanup_jobs. From the log it shows that using
>>>>>>>>>>>>>>> only cancel_delayed_work could not avoid freeing the job
>>>>>>>>>>>>>>> while the sched is in reset. But I don't know exactly where
>>>>>>>>>>>>>>> the driver goes wrong. Do you have any suggestion about this?
>>>>>>>>>>>>>>> +       printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu, pid:%lu\n",
>>>>>>>>>>>>>>> +              current->tgid, current->pid);
>>>>>>>>>>>>>>>         /*
>>>>>>>>>>>>>>>          * Don't destroy jobs while the timeout worker is running OR
>>>>>>>>>>>>>>>          * thread is being parked and hence assumed to not touch
>>>>>>>>>>>>>>>          * ring_mirror_list
>>>>>>>>>>>>>>>          */
>>>>>>>>>>>>>>>         if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>             !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>>>>                 return;
>>>>>>>>>>>>>>> +       printk("Emily:drm_sched_cleanup_jobs,tid:%lu, pid:%lu\n",
>>>>>>>>>>>>>>> +              current->tgid, current->pid);
>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695091] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695104] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>> ring
>>>>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 Nov 12
>>>>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 12:58:20
>>>>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225417] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process 
>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225429] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225473] Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>>>>> pid:2253
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225486] Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process 
>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, 
>>>>>>>>>>>>>>>> Emily
>>>>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>>>>>> cancel_delayed_work returned true it guarantees that the
>>>>>>>>>>>>>>>> timeout work is not running, but it merely means there was a
>>>>>>>>>>>>>>>> pending timeout work which was removed from the workqueue
>>>>>>>>>>>>>>>> before its timer elapsed and so didn't have a chance to be
>>>>>>>>>>>>>>>> dequeued and executed; it doesn't cover already executing
>>>>>>>>>>>>>>>> work. So there is a possibility where, while a timeout work
>>>>>>>>>>>>>>>> started executing, another timeout work already got enqueued
>>>>>>>>>>>>>>>> (maybe through earlier cleanup jobs or through
>>>>>>>>>>>>>>>> drm_sched_fault), and if at this point another
>>>>>>>>>>>>>>>> drm_sched_cleanup_jobs runs, cancel_delayed_work(&sched->work_tdr)
>>>>>>>>>>>>>>>> will return true even while there is a timeout job in
>>>>>>>>>>>>>>>> progress.
>>>>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work, as the
>>>>>>>>>>>>>>>> timeout work itself waits for the scheduler thread to be
>>>>>>>>>>>>>>>> parked again when calling park_thread.
>>>>>>>>>>>>>>>> Andrey
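
To make the described race concrete, a sketch of the existing check with the
problematic interleaving spelled out in comments (illustration only, no fix
implied):

    /* The existing guard in drm_sched_cleanup_jobs(): */
    if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
        !cancel_delayed_work(&sched->work_tdr))
            return;

    /*
     * The gap: if timeout work A is already executing and work B got
     * re-armed meanwhile, cancel_delayed_work() cancels B and returns
     * true, so we fall through and free jobs while A is still running.
     * cancel_delayed_work_sync() would flush A, but A itself waits for
     * the scheduler thread to park, so it cannot be used here.
     */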
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>>>>> behalf of
>>>>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>           /* Don't destroy jobs while the timeout 
>>>>>>>>>>>>>>>>> worker is
>>>>>>>>>>>>>>> running */
>>>>>>>>>>>>>>>>>           if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>>>>> We never free jobs while the timeout working is running to
>>>>>>>>>>>>>>> prevent exactly
>>>>>>>>>>>>>>>> that issue.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>         The drm_sched_job_timedout-> 
>>>>>>>>>>>>>>>>> amdgpu_job_timedout call
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler 
>>>>>>>>>>>>>>>> free the
>>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>>>>>> drm_sched_stop.
>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>> Emily Deng

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 21:05                                                                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-18 21:05 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, Deng, Emily,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Yes, I noticed - I just relied on the documentation in this case.

Andrey

On 11/18/19 3:01 PM, Christian König wrote:
> Well then we should probably update the documentation.
>
> Take a look at the implementation, there is no compiler or SMP barrier 
> at all.
>
> Christian.
>
> Am 18.11.19 um 18:01 schrieb Andrey Grodzovsky:
>> The documentation states it can be used safely with concurrent 
>> list_del_init so I assume it's true - but I think my even bigger 
>> mistake is that without locking i just do list_first_entry right 
>> after list_empty_careful and by this can grab pointer to the same job 
>> as concurrent drm_sched_job_timedout->list_first_entry_or_null - so 
>> yes I see now i have to use locking as you advised there and then i 
>> don't need the list_empty_careful.
>>
>> Andrey
>>
>> On 11/18/19 11:44 AM, Christian König wrote:
>>> list_empty_careful() should only be used for optimizing cases, but 
>>> never if you need to rely on the result.
>>>
>>> The problem is that the function doesn't have any memory barriers 
>>> whatsoever; it just checks whether the next and prev pointers are both 
>>> empty instead of just the next pointer.
>>>
>>> Christian.
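To make the missing-barrier point concrete, list_empty_careful() boils down
to roughly the following (paraphrased rather than copied from
include/linux/list.h; the sketch_ prefix marks it as an illustration): it
compares both the next and prev pointers against the head, with no locking
and no barriers, so the result is only a hint unless the caller excludes
every writer other than a single concurrent list_del_init().

	static inline int sketch_list_empty_careful(const struct list_head *head)
	{
		struct list_head *next = head->next;

		/* No locking, no smp_* barriers: a racing reader may see stale pointers. */
		return (next == head) && (next == head->prev);
	}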
>>>
>>> Am 18.11.19 um 17:23 schrieb Andrey Grodzovsky:
>>>> Can you explain why ? As I see it - list_empty_careful is 
>>>> specifically designed for the case where the only other concurrent 
>>>> operation in progress is list_del_init 
>>>> (https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html) 
>>>> - which is exactly what happens in this patch, no other list 
>>>> altering operation can take place concurrently - so it looks safe 
>>>> to use for me.
>>>>
>>>> Andrey
>>>>
>>>> On 11/18/19 11:16 AM, Christian König wrote:
>>>>> Hi Andrey,
>>>>>
>>>>> the only thing which doesn't looks so good is the switch to 
>>>>> list_empty_careful in drm_sched_cleanup_jobs.
>>>>>
>>>>> We either take the lock here or we don't, but please not that 
>>>>> extra checking.
>>>>>
>>>>> Christian.
>>>>>
>>>>> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>>>>>> Thanks Emily.
>>>>>>
>>>>>> Christian - ping for review.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>>>>>> Hi Andrey,
>>>>>>>       Currently, I am busy with another issue; maybe I will try 
>>>>>>> next week.
>>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for 
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Attached.
>>>>>>>>
>>>>>>>> Emily - can you give it a try ?
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>>>>>> What about instead of peeking at the job to actually remove 
>>>>>>>>>> it from
>>>>>>>>>> ring_mirror_list right there,
>>>>>>>>> Also an interesting idea. We would need to protect the mirror 
>>>>>>>>> list
>>>>>>>>> with a lock again, but that should be the lesser evil.
>>>>>>>>>
>>>>>>>>> Maybe prototype that and see if it works or not.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>>>>>
>>>>>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>>>>>> Another more fundamental question: Could we get rid of the 
>>>>>>>>>>> timeout
>>>>>>>>>>> job at all?
>>>>>>>>>>
>>>>>>>>>> There are other stuff there besides picking the first 
>>>>>>>>>> unfinished job
>>>>>>>>>> which is common for all the drivers - such as freeing guilty 
>>>>>>>>>> signaled
>>>>>>>>>> job and rearming the timeout work timer.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> I mean we used to give this as parameter to the scheduler 
>>>>>>>>>>> callback
>>>>>>>>>>> because we had the timeout worker in the job, but that is no 
>>>>>>>>>>> longer
>>>>>>>>>>> the case.
>>>>>>>>>>>
>>>>>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>>>>>          job = 
>>>>>>>>>>>> list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>>>>> struct drm_sched_job, node);
>>>>>>>>>>> Why don't we just remove that here and only get the first 
>>>>>>>>>>> job after
>>>>>>>>>>> we have stopped the scheduler?
>>>>>>>>>>
>>>>>>>>>> Should be ok since we have the extra check for 
>>>>>>>>>> __kthread_should_park
>>>>>>>>>> in drm_sched_cleanup_jobs which will protect us in this case 
>>>>>>>>>> from a
>>>>>>>>>> wakeup of sched thread and execution of in 
>>>>>>>>>> drm_sched_cleanup_jobs
>>>>>>>>>> after we already parked it. The problem here is we need the
>>>>>>>>>> drm_sched_job to access the private data for each client 
>>>>>>>>>> driver (see
>>>>>>>>>> amdgpu_job_timedout for example). What about instead of 
>>>>>>>>>> peeking at
>>>>>>>>>> the job to actually remove it from ring_mirror_list right 
>>>>>>>>>> there, go
>>>>>>>>>> ahead with it through the reset routine, if it's signaled in the
>>>>>>>>>> meanwhile that great - release it, otherwise put it back into
>>>>>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>>>>>>
>>>>>>>>>> Andrey
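One way to read the "remove it from ring_mirror_list right there" idea is the
sketch below, with the mirror list protected by job_list_lock again; the field
names are those of the scheduler at the time, and the helpers themselves are
illustrative and untested:

	static void sketch_detach_bad_job(struct drm_gpu_scheduler *sched,
					  struct drm_sched_job *bad)
	{
		unsigned long flags;

		/* Take the bad job off the mirror list so cleanup cannot free it. */
		spin_lock_irqsave(&sched->job_list_lock, flags);
		list_del_init(&bad->node);
		spin_unlock_irqrestore(&sched->job_list_lock, flags);
	}

	static void sketch_requeue_bad_job(struct drm_gpu_scheduler *sched,
					   struct drm_sched_job *bad)
	{
		unsigned long flags;

		/* Not signaled after the reset: hand it back for drm_sched_resubmit_jobs(). */
		spin_lock_irqsave(&sched->job_list_lock, flags);
		list_add(&bad->node, &sched->ring_mirror_list);
		spin_unlock_irqrestore(&sched->job_list_lock, flags);
	}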
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>>>>>> This why I asked for a trace with timer enabled, but since 
>>>>>>>>>>>> there is
>>>>>>>>>>>> a finite number of places we touch the timer Emily can just 
>>>>>>>>>>>> put
>>>>>>>>>>>> prints there. Also, I wonder if this temp fix helps her 
>>>>>>>>>>>> with the
>>>>>>>>>>>> issue or not.
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>>>>>> The question is where do we rearm the timer for this 
>>>>>>>>>>>>> problem to
>>>>>>>>>>>>> occur?
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in 
>>>>>>>>>>>>>> reset
>>>>>>>>>>>>>> work and artificially rearming the timeout timer just 
>>>>>>>>>>>>>> before the
>>>>>>>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. 
>>>>>>>>>>>>>> This I
>>>>>>>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> basic_fix.patch handles this by testing whether another timer
>>>>>>>>>>>>>> is already armed on this scheduler, or whether a timeout work is
>>>>>>>>>>>>>> executing right now (see the documentation for work_busy) - obviously
>>>>>>>>>>>>>> this is not a full solution as this will not protect from 
>>>>>>>>>>>>>> races
>>>>>>>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>>>>>>>> drm_sched_fault -  so we probably need to account for 
>>>>>>>>>>>>>> this by
>>>>>>>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset 
>>>>>>>>>>>>>> really
>>>>>>>>>>>>>> mutually exclusive and not like now.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
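Since basic_fix.patch is only attached to the mail, here is a guess at what
the work_busy() check could look like - a sketch under the assumption that
sched->work_tdr is the timeout delayed_work; work_busy() only reports a
snapshot of WORK_BUSY_PENDING / WORK_BUSY_RUNNING, which is why it cannot
close the drm_sched_fault() race mentioned above:

	static bool sketch_tdr_in_flight(struct drm_gpu_scheduler *sched)
	{
		/* Snapshot only: a new timeout can be queued right after this returns. */
		return work_busy(&sched->work_tdr.work) &
		       (WORK_BUSY_PENDING | WORK_BUSY_RUNNING);
	}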
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> you need to print which scheduler instance is freeing 
>>>>>>>>>>>>>>> the jobs
>>>>>>>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>>>>>>>> completely meaningless here since we are called from 
>>>>>>>>>>>>>>> different
>>>>>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Apart from that I will look into this a bit deeper when 
>>>>>>>>>>>>>>> I have
>>>>>>>>>>>>>>> time.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>      I add the follow print in function 
>>>>>>>>>>>>>>>> drm_sched_cleanup_jobs.
>>>>>>>>>>>>>>>>  From the log it shows that only use 
>>>>>>>>>>>>>>>> cancel_delayed_work could
>>>>>>>>>>>>>>>> not avoid to free job when the sched is in reset. But 
>>>>>>>>>>>>>>>> don’t
>>>>>>>>>>>>>>>> know exactly where it is wrong about the driver. Do you 
>>>>>>>>>>>>>>>> have
>>>>>>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>>>>> running  OR thread
>>>>>>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>>>>>>           */
>>>>>>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>>>>>                  return;
>>>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, 
>>>>>>>>>>>>>>>> pid:%lu\n",
>>>>>>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695091] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695104] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695107] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>> ring
>>>>>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 
>>>>>>>>>>>>>>>> Nov 12
>>>>>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 
>>>>>>>>>>>>>>>> 12:58:20
>>>>>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225417] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225425] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process 
>>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225429] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225473] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>>>>>> pid:2253
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225486] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process 
>>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; 
>>>>>>>>>>>>>>>>> Deng, Emily
>>>>>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>>>>>>> cancel_delayed_work returned true it guarantees that the timeout
>>>>>>>>>>>>>>>>> work is not running, but it merely means there was a pending
>>>>>>>>>>>>>>>>> timeout work which was removed from the workqueue before its
>>>>>>>>>>>>>>>>> timer elapsed, and so it didn't have a chance to be dequeued and
>>>>>>>>>>>>>>>>> executed; it doesn't cover already executing work. So there is a
>>>>>>>>>>>>>>>>> possibility where, while the timeout work started executing,
>>>>>>>>>>>>>>>>> another timeout work already got enqueued (maybe through earlier
>>>>>>>>>>>>>>>>> cleanup jobs or through drm_sched_fault), and if at this point
>>>>>>>>>>>>>>>>> another drm_sched_cleanup_jobs runs,
>>>>>>>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true even
>>>>>>>>>>>>>>>>> while there is a timeout job in progress.
>>>>>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work, as the
>>>>>>>>>>>>>>>>> timeout work itself waits for the scheduler thread to be parked
>>>>>>>>>>>>>>>>> again when calling park_thread.
>>>>>>>>>>>>>>>>> Andrey
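The semantics described above reduce to this hedged illustration (not driver
code; sketch_tdr_cancelled() is only a name for the existing check):

	static bool sketch_tdr_cancelled(struct drm_gpu_scheduler *sched)
	{
		/*
		 * True only means a *pending* timeout was dequeued before its
		 * timer fired; an instance that has already started executing
		 * is not covered.  cancel_delayed_work_sync() would wait for
		 * it, but, as noted, the timeout handler itself waits for the
		 * scheduler thread to park, so a sync cancel from this path
		 * would deadlock.
		 */
		return cancel_delayed_work(&sched->work_tdr);
	}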
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>>>>>> behalf of
>>>>>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> /* Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>>>>> running */
>>>>>>>>>>>>>>>>>> if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>>>>>> We never free jobs while the timeout working is 
>>>>>>>>>>>>>>>>> running to
>>>>>>>>>>>>>>>> prevent exactly
>>>>>>>>>>>>>>>>> that issue.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>         The drm_sched_job_timedout-> amdgpu_job_timedout path calls
>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler frees the
>>>>>>>>>>>>>>>>>> jobs while in amdgpu_device_gpu_recover, and before calling
>>>>>>>>>>>>>>>>>> drm_sched_stop.
>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in 
>>>>>>>>>>>>>>>>>>> this case?
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a 
>>>>>>>>>>>>>>>>>>> guilty job
>>>>>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to 
>>>>>>>>>>>>>>>>>>> free
>>>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread 
>>>>>>>>>>>>>>>>>>>>> and only
>>>>>>>>>>>>>>>> if no
>>>>>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>>>>>           Please refer to follow log, when it 
>>>>>>>>>>>>>>>>>>>>>> enter to
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover
>>>>>>>>>>>>>>>>>>>>> function, the bad job 000000005086879e is freeing in
>>>>>>>>>>>>>>>> function
>>>>>>>>>>>>>>>>>>>>> amdgpu_job_free_cb at the same time, because of the
>>>>>>>>>>>>>>>> hardware fence
>>>>>>>>>>>>>>>>>>> signal.
>>>>>>>>>>>>>>>>>>>>> But amdgpu_device_gpu_recover goes faster, at this 
>>>>>>>>>>>>>>>>>>>>> case,
>>>>>>>>>>>>>>>>>>>>> the s_fence is already freed, but job is not freed 
>>>>>>>>>>>>>>>>>>>>> in time.
>>>>>>>>>>>>>>>> Then this issue
>>>>>>>>>>>>>>>>> occurs.
>>>>>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>>>>>>>>>> *ERROR* ring
>>>>>>>>>>>>>>>>>>> sdma0
>>>>>>>>>>>>>>>>>>>>>> timeout, signaled seq=2481, emitted seq=2483 [
>>>>>>>>>>>>>>>>>>>>>> 449.793202] [drm:amdgpu_job_timedout [amdgpu]]
>>>>>>>> *ERROR*
>>>>>>>>>>>>>>>>>>>>>> Process
>>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>>>> process pid 0 thread pid 0, s_job:000000005086879e [
>>>>>>>>>>>>>>>> 449.794163]
>>>>>>>>>>>>>>>>>>>>> amdgpu
>>>>>>>>>>>>>>>>>>>>> 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>>>>>>>> process pid 0 thread  pid 0, 
>>>>>>>>>>>>>>>>>>>>>> s_job:000000005086879e [
>>>>>>>>>>>>>>>> 449.794221]
>>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>>>>>>>>>>>>>>>> process
>>>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:0000000066eb74ab [ 449.794222]
>>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>>>>>>>>>>>>>>>> process
>>>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000d4438ad9 [ 449.794255]
>>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>>>>>>>>>>>>>>>> process
>>>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>>>> thread pid 0, s_job:00000000b6d69c65 [ 449.794257]
>>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process information: 
>>>>>>>>>>>>>>>>>>>>>> process
>>>>>>>>>>>>>>>> pid 0
>>>>>>>>>>>>>>>>>>>>>> thread pid 0,
>>>>>>>>>>>>>>>>>>>>> s_job:00000000ea85e922 [ 449.794287]
>>>>>>>>>>>>>>>>>>>>> Emily:amdgpu_job_free_cb,Process
>>>>>>>>>>>>>>>>>>>>> information: process pid 0 thread pid 0,
>>>>>>>>>>>>>>>> s_job:00000000ed3a5ac6 [
>>>>>>>>>>>>>>>>>>>>> 449.794366] BUG: unable to handle kernel NULL pointer
>>>>>>>>>>>>>>>> dereference
>>>>>>>>>>>>>>>>>>>>> at
>>>>>>>>>>>>>>>>>>>>> 00000000000000c0 [ 449.800818] PGD 0 P4D 0
>>>>>>>> [  449.801040]
>>>>>>>>>>>>>>>> Oops:
>>>>>>>>>>>>>>>>>>>>> 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 
>>>>>>>>>>>>>>>>>>>>>> Tainted:
>>>>>>>>>>>>>>>> G OE
>>>>>>>>>>>>>>>>>>>>> 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC 
>>>>>>>>>>>>>>>>>>>>>> (i440FX
>>>>>>>> +
>>>>>>>>>>>>>>>> PIIX,
>>>>>>>>>>>>>>>>>>>>>> 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [
>>>>>>>>>>>>>>>>>>>>>> 449.802944]
>>>>>>>>>>>>>>>>>>>>>> Workqueue: events drm_sched_job_timedout [amd_sched]
>>>>>>>> [
>>>>>>>>>>>>>>>>>>>>>> 449.803488]
>>>>>>>>>>>>>>>>>>> RIP:
>>>>>>>>>>>>>>>>>>>>> 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 
>>>>>>>>>>>>>>>>>>>>>> 0f 85
>>>>>>>>>>>>>>>> 56 ff ff
>>>>>>>>>>>>>>>>>>>>>> ff
>>>>>>>>>>>>>>>>>>>>>> 45 85 e4 0f
>>>>>>>>>>>>>>>>>>>>> 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 
>>>>>>>>>>>>>>>>>>>>> 00 48
>>>>>>>>>>>>>>>> 8b 40 10
>>>>>>>>>>>>>>>>>>>>> <48> 8b
>>>>>>>>>>>>>>>>>>> 98
>>>>>>>>>>>>>>>>>>>>> c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43
>>>>>>>>>>>>>>>> 48 a8 01
>>>>>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS:
>>>>>>>>>>>>>>>> 00010286 [
>>>>>>>>>>>>>>>>>>>>>> 449.806032] RAX: 0000000000000000 RBX:
>>>>>>>> 0000000000000000
>>>>>>>>>>>>>>>> RCX:
>>>>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.806625] RDX: ffffb4c7c08f5ac0
>>>>>>>> RSI:
>>>>>>>>>>>>>>>>>>>>>> 0000000fffffffe0 RDI: 0000000000000246 [ 449.807224]
>>>>>>>> RBP:
>>>>>>>>>>>>>>>>>>>>>> ffffb4c7c08f7de0 R08: 00000068b9d54000 R09:
>>>>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>>>>> 449.807818] R10: 0000000000000000 R11:
>>>>>>>> 0000000000000148
>>>>>>>>>>>>>>>> R12:
>>>>>>>>>>>>>>>>>>>>>> 0000000000000000 [ 449.808411] R13: ffffb4c7c08f7da0
>>>>>>>> R14:
>>>>>>>>>>>>>>>>>>>>>> ffff8d82b8525d40 R15: ffff8d82b8525d40 [ 
>>>>>>>>>>>>>>>>>>>>>> 449.809004] FS:
>>>>>>>>>>>>>>>>>>>>>> 0000000000000000(0000) GS:ffff8d82bfd80000(0000)
>>>>>>>>>>>>>>>>>>>>>> knlGS:0000000000000000 [ 449.809674] CS:  0010 
>>>>>>>>>>>>>>>>>>>>>> DS: 0000
>>>>>>>>>>>>>>>> ES: 0000
>>>>>>>>>>>>>>>>> CR0:
>>>>>>>>>>>>>>>>>>>>>> 0000000080050033 [ 449.810153] CR2: 00000000000000c0
>>>>>>>> CR3:
>>>>>>>>>>>>>>>>>>>>>> 000000003cc0a001 CR4: 00000000003606e0 [ 449.810747]
>>>>>>>> DR0:
>>>>>>>>>>>>>>>>>>>>> 0000000000000000 DR1: 0000000000000000 DR2:
>>>>>>>>>>>>>>>> 0000000000000000 [
>>>>>>>>>>>>>>>>>>>>> 449.811344] DR3: 0000000000000000 DR6:
>>>>>>>> 00000000fffe0ff0 DR7:
>>>>>>>>>>>>>>>>>>>>> 0000000000000400 [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>>>>>> [ 449.812206] amdgpu_job_timedout+0x114/0x140
>>>>>>>> [amdgpu]
>>>>>>>>>>>>>>>>>>>>>> [ 449.812635] drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>>>>>>>> [amd_sched] [ 449.813139] ?
>>>>>>>>>>>>>>>>>>>>>> amdgpu_cgs_destroy_device+0x10/0x10
>>>>>>>>>>>>>>>> [amdgpu] [
>>>>>>>>>>>>>>>>>>>>>> 449.813609] ? drm_sched_job_timedout+0x44/0x90
>>>>>>>>>>>>>>>> [amd_sched] [
>>>>>>>>>>>>>>>>>>>>>> 449.814077] process_one_work+0x1fd/0x3f0 [ 
>>>>>>>>>>>>>>>>>>>>>> 449.814417]
>>>>>>>>>>>>>>>>>>>>>> worker_thread+0x34/0x410 [ 449.814728]
>>>>>>>>>>>>>>>> kthread+0x121/0x140 [
>>>>>>>>>>>>>>>>>>>>>> 449.815004] ? process_one_work+0x3f0/0x3f0 [
>>>>>>>>>>>>>>>> 449.815374]  ?
>>>>>>>>>>>>>>>>>>>>>> kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>>>>>> [  449.815799] ret_from_fork+0x35/0x40
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for
>>>>>>>>>>>>>>>> job->s_fence to be
>>>>>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering 
>>>>>>>>>>>>>>>>>>>>>>> over some
>>>>>>>>>>>>>>>> bigger issues.
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian 
>>>>>>>>>>>>>>>>>>>>>>>>> <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to 
>>>>>>>>>>>>>>>>>>>>>>>>> wake up
>>>>>>>>>>>>>>>>>>>>>>>>> :)
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian 
>>>>>>>>>>>>>>>>>>>>>>>>>>> <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the 
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> s_fence is
>>>>>>>>>>>>>>>> freed.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Then it will has null pointer in
>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when 
>>>>>>>>>>>>>>>>>>>>>>>>>>>> the job
>>>>>>>>>>>>>>>> is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
>>>>>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one
>>>>>>>>>>>>>>>>>>>>>>>>>>> case, when it enters amdgpu_device_gpu_recover, it is already in
>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup, and at this time it will go on to free the
>>>>>>>>>>>>>>>>>>>>>>>>>>> job. But amdgpu_device_gpu_recover is sometimes faster. At that
>>>>>>>>>>>>>>>>>>>>>>>>>>> time, the job is not freed yet, but s_fence is already NULL.
>>>>>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, 
>>>>>>>>>>>>>>>>>>>>>>>>>> s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after 
>>>>>>>>>>>>>>>>>>>>>>>>> freeing the
>>>>>>>>>>>>>>>> reference
>>>>>>>>>>>>>>>>>>>>>>>>> to the
>>>>>>>>>>>>>>>>>>>>> s_fence.
>>>>>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger 
>>>>>>>>>>>>>>>>>>>>>>>>> problem
>>>>>>>>>>>>>>>> here. This
>>>>>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then 
>>>>>>>>>>>>>>>>>>>>>>>>>>>> that
>>>>>>>>>>>>>>>> means the
>>>>>>>>>>>>>>>>>>>>>>>>>>>> problem is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng 
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> |  2
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +- drivers/gpu/drm/scheduler/sched_main.c |
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 11
>>>>>>>>>>>>>>>> ++++++---
>>>>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 7 insertions(+), 6
>>>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +++
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int
>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover(struct
>>>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>              * job->base holds a reference to
>>>>>>>>>>>>>>>> parent fence
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -  if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +  if (job && job->base.s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + job->base.s_fence->parent
>>>>>>>>>>>>>>>>>>>>>>> &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_is_signaled(job->base.s_fence-
>>>>>>>>> parent))
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void
>>>>>>>>>>>>>>>>> drm_sched_increase_karma(struct
>>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(entity, tmp,
>>>>>>>>>>>>>>>>>>> &rq-
>>>>>>>>>>>>>>>>>>>>>>>> entities,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> list) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - if
>>>>>>>>>>>>>>>> (bad->s_fence->scheduled.context
>>>>>>>>>>>>>>>>>>>>>>> ==
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +                          if 
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> (bad->s_fence &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + (bad->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> (atomic_read(&bad-
>>>>>>>>>>>>>>>>>>>>>>>> karma) >
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> bad->sched-
>>>>>>>>>>>>>>>>>>>> hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> if
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> (entity-
>>>>>>>>>>>>>>>>>>>> guilty) @@ -376,7 +376,7 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct
>>>>>>>>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> * This iteration is thread safe as
>>>>>>>>>>>>>>>> sched thread
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> is
>>>>>>>>>>>>>>>>>>> stopped.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe_reverse(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> &sched- ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -          if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +          if (s_job->s_fence &&
>>>>>>>>>>>>>>>> s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_remove_callback(s_job-
>>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>> parent,
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>> @@ -
>>>>>>>>>>>>>>>>>>>>>>> 395,7
>>>>>>>>>>>>>>>>>>>>>>>>>>> +395,8 @@ void
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_stop(struct drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>>>>>>>>> *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> *
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> * Job is still alive so fence refcount at
>>>>>>>>>>>>>>>>>>> least 1
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - dma_fence_wait(&s_job->s_fence->finished,
>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + dma_fence_wait(&s_job->s_fence-
>>>>>>>>>>>>>>>>>>>>>>>> finished,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> /*
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> * We must keep bad job alive for later
>>>>>>>>>>>>>>>>>>> use
>>>>>>>>>>>>>>>>>>>>>>> during @@
>>>>>>>>>>>>>>>>>>>>>>>>>>>> -438,7
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +439,7 @@ void drm_sched_start(struct
>>>>>>>>>>>>>>>> drm_gpu_scheduler
>>>>>>>>>>>>>>>>>>>>> *sched,
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +bool
>>>>>>>>>>>>>>>>>>>>>>>>>>>> full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> list_for_each_entry_safe(s_job, tmp,
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> &sched->ring_mirror_list,
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> node)
>>>>>>>>>>>>>>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - struct dma_fence *fence =
>>>>>>>>>>>>>>>> s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + struct dma_fence *fence =
>>>>>>>>>>>>>>>> s_job->s_fence ?
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + s_job-
>>>>>>>>>>>>>>>>>>>>>>>> s_fence-
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> parent :
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix the null pointer issue for tdr
@ 2019-11-18 21:05                                                                                                                         ` Andrey Grodzovsky
  0 siblings, 0 replies; 80+ messages in thread
From: Andrey Grodzovsky @ 2019-11-18 21:05 UTC (permalink / raw)
  To: christian.koenig, Deng, Emily, amd-gfx

Ye, I noticed - I just relied on the documentation in this case.

Andrey

On 11/18/19 3:01 PM, Christian König wrote:
> Well then we should probably update the documentation.
>
> Take a look at the implementation, there is no compiler or SMP barrier 
> at all.
>
> Christian.
>
> Am 18.11.19 um 18:01 schrieb Andrey Grodzovsky:
>> The documentation states it can be used safely with concurrent 
>> list_del_init so I assume it's true - but I think my even bigger 
>> mistake is that without locking i just do list_first_entry right 
>> after list_empty_careful and by this can grab pointer to the same job 
>> as concurrent drm_sched_job_timedout->list_first_entry_or_null - so 
>> yes I see now i have to use locking as you advised there and then i 
>> don't need the list_empty_careful.
>>
>> Andrey
>>
>> On 11/18/19 11:44 AM, Christian König wrote:
>>> list_empty_careful() should only be used for optimizing cases, but 
>>> never if you need to rely on the result.
>>>
>>> The problem is that the function doesn't has any memory barriers 
>>> whatsoever, it just checks if the next and prev pointer are both 
>>> empty instead of just the next pointer.
>>>
>>> Christian.
>>>
>>> Am 18.11.19 um 17:23 schrieb Andrey Grodzovsky:
>>>> Can you explain why ? As I see it - list_empty_careful is 
>>>> specifically designed for the case where the only other concurrent 
>>>> operation in progress is list_del_init 
>>>> (https://www.kernel.org/doc/htmldocs/kernel-api/API-list-empty-careful.html) 
>>>> - which is exactly what happens in this patch, no other list 
>>>> altering operation can take place concurrently - so it looks safe 
>>>> to use for me.
>>>>
>>>> Andrey
>>>>
>>>> On 11/18/19 11:16 AM, Christian König wrote:
>>>>> Hi Andrey,
>>>>>
>>>>> the only thing which doesn't looks so good is the switch to 
>>>>> list_empty_careful in drm_sched_cleanup_jobs.
>>>>>
>>>>> We either take the lock here or we don't, but please not that 
>>>>> extra checking.
>>>>>
>>>>> Christian.
>>>>>
>>>>> Am 18.11.19 um 15:07 schrieb Andrey Grodzovsky:
>>>>>> Thanks Emily.
>>>>>>
>>>>>> Christan - ping for review.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>> On 11/14/19 11:39 PM, Deng, Emily wrote:
>>>>>>> Hi Andrey,
>>>>>>>       Currently, I am busying with another issue, maybe will try 
>>>>>>> next week.
>>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>> Sent: Friday, November 15, 2019 6:14 AM
>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Deng, Emily
>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer issue for 
>>>>>>>> tdr
>>>>>>>>
>>>>>>>> Attached.
>>>>>>>>
>>>>>>>> Emily - can you give it a try ?
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>> On 11/14/19 3:12 AM, Christian König wrote:
>>>>>>>>>> What about instead of peeking at the job to actually remove 
>>>>>>>>>> it from
>>>>>>>>>> ring_mirror_list right there,
>>>>>>>>> Also an interesting idea. We would need to protect the mirror 
>>>>>>>>> list
>>>>>>>>> with a lock again, but that should be the lesser evil.
>>>>>>>>>
>>>>>>>>> Maybe prototype that and see if it works or not.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 13.11.19 um 17:00 schrieb Andrey Grodzovsky:
>>>>>>>>>>
>>>>>>>>>> On 11/13/19 9:20 AM, Christian König wrote:
>>>>>>>>>>> Another more fundamental question: Could we get rid of the 
>>>>>>>>>>> timeout
>>>>>>>>>>> job at all?
>>>>>>>>>>
>>>>>>>>>> There are other stuff there besides picking the first 
>>>>>>>>>> unfinished job
>>>>>>>>>> which is common for all the drivers - such as freeing guilty 
>>>>>>>>>> signaled
>>>>>>>>>> job and rearming the timeout work timer.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> I mean we used to give this as parameter to the scheduler 
>>>>>>>>>>> callback
>>>>>>>>>>> because we had the timeout worker in the job, but that is no 
>>>>>>>>>>> longer
>>>>>>>>>>> the case.
>>>>>>>>>>>
>>>>>>>>>>> E.g. in drm_sched_job_timedout() we do the following:
>>>>>>>>>>>>          job = 
>>>>>>>>>>>> list_first_entry_or_null(&sched->ring_mirror_list,
>>>>>>>>>>>> struct drm_sched_job, node);
>>>>>>>>>>> Why don't we just remove that here and only get the first 
>>>>>>>>>>> job after
>>>>>>>>>>> we have stopped the scheduler?
>>>>>>>>>>
>>>>>>>>>> Should be ok since we have the extra check for 
>>>>>>>>>> __kthread_should_park
>>>>>>>>>> in drm_sched_cleanup_jobs which will protect us in this case 
>>>>>>>>>> from a
>>>>>>>>>> wakeup of sched thread and execution of in 
>>>>>>>>>> drm_sched_cleanup_jobs
>>>>>>>>>> after we already parked it. The problem here is we need the
>>>>>>>>>> drm_sched_job to access the private data for each client 
>>>>>>>>>> driver (see
>>>>>>>>>> amdgpu_job_timedout for example). What about instead of 
>>>>>>>>>> peeking at
>>>>>>>>>> the job to actually remove it from ring_mirror_list right 
>>>>>>>>>> there, go
>>>>>>>>>> ahead with it through the reset routine, if it's signaled in the
>>>>>>>>>> meanwhile that great - release it, otherwise put it back into
>>>>>>>>>> ring_mirror_list in drm_sched_resubmit_jobs.
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 13.11.19 um 15:12 schrieb Andrey Grodzovsky:
>>>>>>>>>>>> This why I asked for a trace with timer enabled, but since 
>>>>>>>>>>>> there is
>>>>>>>>>>>> a finite number of places we touch the timer Emily can just 
>>>>>>>>>>>> put
>>>>>>>>>>>> prints there. Also, I wonder if this temp fix helps her 
>>>>>>>>>>>> with the
>>>>>>>>>>>> issue or not.
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>> On 11/13/19 2:36 AM, Christian König wrote:
>>>>>>>>>>>>> The question is where do we rearm the timer for this 
>>>>>>>>>>>>> problem to
>>>>>>>>>>>>> occur?
>>>>>>>>>>>>>
>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 12.11.19 um 20:21 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>> I was able to reproduce the crash by using the attached
>>>>>>>>>>>>>> simulate_crash.patch - waiting on guilty job to signal in 
>>>>>>>>>>>>>> reset
>>>>>>>>>>>>>> work and artificially rearming the timeout timer just 
>>>>>>>>>>>>>> before the
>>>>>>>>>>>>>> check for !cancel_delayed_work(&sched->work_tdr) in
>>>>>>>>>>>>>> drm_sched_cleanup_jobs - crash log attached in crash.log. 
>>>>>>>>>>>>>> This I
>>>>>>>>>>>>>> think confirms my theory i described earlier in this thread.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> basic_fix.patch handles this by testing whether another 
>>>>>>>>>>>>>> timer
>>>>>>>>>>>>>> already armed ob this scheduler or is there a timeout 
>>>>>>>>>>>>>> work in
>>>>>>>>>>>>>> execution right now (see documentation for work_busy) - 
>>>>>>>>>>>>>> obviously
>>>>>>>>>>>>>> this is not a full solution as this will not protect from 
>>>>>>>>>>>>>> races
>>>>>>>>>>>>>> if for example there is immediate work scheduling such as in
>>>>>>>>>>>>>> drm_sched_fault -  so we probably need to account for 
>>>>>>>>>>>>>> this by
>>>>>>>>>>>>>> making drm_sched_cleanup_jobs (at least in the part where it
>>>>>>>>>>>>>> iterates ring mirror list and frees jobs) and GPU reset 
>>>>>>>>>>>>>> really
>>>>>>>>>>>>>> mutually exclusive and not like now.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 11/11/19 4:11 PM, Christian König wrote:
>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> you need to print which scheduler instance is freeing 
>>>>>>>>>>>>>>> the jobs
>>>>>>>>>>>>>>> and which one is triggering the reset. The TID and PID is
>>>>>>>>>>>>>>> completely meaningless here since we are called from 
>>>>>>>>>>>>>>> different
>>>>>>>>>>>>>>> worker threads and the TID/PID can change on each call.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Apart from that I will look into this a bit deeper when 
>>>>>>>>>>>>>>> I have
>>>>>>>>>>>>>>> time.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 12.11.19 um 07:02 schrieb Deng, Emily:
>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>      I add the follow print in function 
>>>>>>>>>>>>>>>> drm_sched_cleanup_jobs.
>>>>>>>>>>>>>>>>  From the log it shows that only use 
>>>>>>>>>>>>>>>> cancel_delayed_work could
>>>>>>>>>>>>>>>> not avoid to free job when the sched is in reset. But 
>>>>>>>>>>>>>>>> don’t
>>>>>>>>>>>>>>>> know exactly where it is wrong about the driver. Do you 
>>>>>>>>>>>>>>>> have
>>>>>>>>>>>>>>>> any suggestion about this?
>>>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs:begin,tid:%lu,
>>>>>>>>>>>>>>>> pid:%lu\n", current->tgid, current->pid);
>>>>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>>>>           * Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>>>>> running  OR thread
>>>>>>>>>>>>>>>>           * is being parked and hence assumed to not touch
>>>>>>>>>>>>>>>> ring_mirror_list
>>>>>>>>>>>>>>>>           */
>>>>>>>>>>>>>>>>           if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr)))
>>>>>>>>>>>>>>>>                  return;
>>>>>>>>>>>>>>>> + printk("Emily:drm_sched_cleanup_jobs,tid:%lu, 
>>>>>>>>>>>>>>>> pid:%lu\n",
>>>>>>>>>>>>>>>> current->tgid, current->pid);
>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695091] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695104] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695105] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695107] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11380.695107] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.222954] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>> ring
>>>>>>>>>>>>>>>> sdma0 timeout, signaled seq=78585, emitted seq=78587 
>>>>>>>>>>>>>>>> Nov 12
>>>>>>>>>>>>>>>> 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>>>> [11381.224275] [drm:amdgpu_job_timedout [amdgpu]] *ERROR*
>>>>>>>>>>>>>>>> Process information: process pid 0 thread pid 0,
>>>>>>>>>>>>>>>> s_job:00000000fe75ab36,tid=15603, pid=15603 Nov 12 
>>>>>>>>>>>>>>>> 12:58:20
>>>>>>>>>>>>>>>> ubuntu-drop-August-2018-rc2-gpu0-vf02 kernel:
>>>>>>>>>>>>>>>> [11381.225413] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225417] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225425] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225425] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225428] Emily:amdgpu_job_free_cb,Process 
>>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000fe75ab36, 
>>>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225429] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225430] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225473] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2253,
>>>>>>>>>>>>>>>> pid:2253
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225486] 
>>>>>>>>>>>>>>>> Emily:drm_sched_cleanup_jobs:begin,tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225489] Emily:drm_sched_cleanup_jobs,tid:2262, 
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>> Nov 12 12:58:20 ubuntu-drop-August-2018-rc2-gpu0-vf02 
>>>>>>>>>>>>>>>> kernel:
>>>>>>>>>>>>>>>> [11381.225494] Emily:amdgpu_job_free_cb,Process 
>>>>>>>>>>>>>>>> information:
>>>>>>>>>>>>>>>> process  pid 0 thread  pid 0, s_job:00000000f086ec84, 
>>>>>>>>>>>>>>>> tid:2262,
>>>>>>>>>>>>>>>> pid:2262
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>>>>>>>>>>>>>>>> Sent: Tuesday, November 12, 2019 11:28 AM
>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; 
>>>>>>>>>>>>>>>>> Deng, Emily
>>>>>>>>>>>>>>>>> <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>> Thinking more about this claim - we assume here that if
>>>>>>>>>>>>>>>> cancel_delayed_work
>>>>>>>>>>>>>>>>> returned true it guarantees that timeout work is not 
>>>>>>>>>>>>>>>>> running
>>>>>>>>>>>>>>>> but, it merely
>>>>>>>>>>>>>>>>> means there was a pending timeout work which was removed
>>>>>>>> from
>>>>>>>>>>>>>>>>> the workqueue before it's timer elapsed and so it 
>>>>>>>>>>>>>>>>> didn't have
>>>>>>>>>>>>>>>>> a
>>>>>>>>>>>>>>>> chance to be
>>>>>>>>>>>>>>>>> dequeued and executed, it doesn't cover already executing
>>>>>>>>>>>>>>>> work. So there is a
>>>>>>>>>>>>>>>>> possibility where while timeout work started executing 
>>>>>>>>>>>>>>>>> another
>>>>>>>>>>>>>>>> timeout work
>>>>>>>>>>>>>>>>> already got enqueued (maybe through earlier cleanup 
>>>>>>>>>>>>>>>>> jobs or
>>>>>>>>>>>>>>>> through
>>>>>>>>>>>>>>>>> drm_sched_fault) and if at this point another
>>>>>>>>>>>>>>>> drm_sched_cleanup_jobs runs
>>>>>>>>>>>>>>>>> cancel_delayed_work(&sched->work_tdr) will return true 
>>>>>>>>>>>>>>>>> even
>>>>>>>>>>>>>>>> while there is a
>>>>>>>>>>>>>>>>> timeout job in progress.
>>>>>>>>>>>>>>>>> Unfortunately we cannot change cancel_delayed_work to
>>>>>>>>>>>>>>>>> cancel_delayed_work_sync to flush the timeout work as 
>>>>>>>>>>>>>>>>> timeout
>>>>>>>>>>>>>>>> work itself
>>>>>>>>>>>>>>>>> waits for schedule thread  to be parked again when 
>>>>>>>>>>>>>>>>> calling
>>>>>>>>>>>>>>>> park_thread.
>>>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> ________________________________________
>>>>>>>>>>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on
>>>>>>>>>>>>>>>> behalf of
>>>>>>>>>>>>>>>>> Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: 08 November 2019 05:35:18
>>>>>>>>>>>>>>>>> To: Deng, Emily; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer 
>>>>>>>>>>>>>>>>> issue
>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> exactly that can't happen. See here:
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> /* Don't destroy jobs while the timeout worker is
>>>>>>>>>>>>>>>> running */
>>>>>>>>>>>>>>>>>> if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>>>>>>>>>>>>>>>> !cancel_delayed_work(&sched->work_tdr))
>>>>>>>>>>>>>>>>>>                   return NULL;
>>>>>>>>>>>>>>>>> We never free jobs while the timeout working is 
>>>>>>>>>>>>>>>>> running to
>>>>>>>>>>>>>>>> prevent exactly
>>>>>>>>>>>>>>>>> that issue.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:32 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>         The drm_sched_job_timedout-> 
>>>>>>>>>>>>>>>>>> amdgpu_job_timedout call
>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover. I mean the main scheduler 
>>>>>>>>>>>>>>>>> free the
>>>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>>> in amdgpu_device_gpu_recover, and before calling
>>>>>>>> drm_sched_stop.
>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:26 PM
>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> well who is calling amdgpu_device_gpu_recover() in 
>>>>>>>>>>>>>>>>>>> this case?
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> When it's not the scheduler we shouldn't have a 
>>>>>>>>>>>>>>>>>>> guilty job
>>>>>>>>>>>>>>>> in the first place.
>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:22 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>> Hi Chrisitan,
>>>>>>>>>>>>>>>>>>>>          No, I am with the new branch and also has the
>>>>>>>>>>>>>>>> patch. Even it
>>>>>>>>>>>>>>>>>>>> are freed by
>>>>>>>>>>>>>>>>>>> main scheduler, how we could avoid main scheduler to 
>>>>>>>>>>>>>>>>>>> free
>>>>>>>>>>>>>>>> jobs while
>>>>>>>>>>>>>>>>>>> enter to function amdgpu_device_gpu_recover?
>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 6:15 PM
>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null pointer
>>>>>>>>>>>>>>>> issue for tdr
>>>>>>>>>>>>>>>>>>>>> Hi Emily,
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> in this case you are on an old code branch.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Jobs are freed now by the main scheduler thread and only if no
>>>>>>>>>>>>>>>>>>>>> timeout handler is running.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> See this patch here:
>>>>>>>>>>>>>>>>>>>>>> commit 5918045c4ed492fb5813f980dcf89a90fefd0a4e
>>>>>>>>>>>>>>>>>>>>>> Author: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>> Date:   Thu Apr 18 11:00:21 2019 -0400
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>         drm/scheduler: rework job destruction
>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 11:11 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>>>>>>>>>>>           Please refer to the following log: when it enters the
>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover function, the bad job 000000005086879e is being
>>>>>>>>>>>>>>>>>>>>>> freed in amdgpu_job_free_cb at the same time, because the hardware fence
>>>>>>>>>>>>>>>>>>>>>> signaled. But amdgpu_device_gpu_recover runs faster; in this case the
>>>>>>>>>>>>>>>>>>>>>> s_fence is already freed, but the job is not freed in time. Then this
>>>>>>>>>>>>>>>>>>>>>> issue occurs.
>>>>>>>>>>>>>>>>>>>>>> [ 449.792189] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 timeout, signaled seq=2481, emitted seq=2483
>>>>>>>>>>>>>>>>>>>>>> [ 449.793202] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process pid 0 thread pid 0, s_job:000000005086879e
>>>>>>>>>>>>>>>>>>>>>> [ 449.794163] amdgpu 0000:00:08.0: GPU reset begin!
>>>>>>>>>>>>>>>>>>>>>> [ 449.794175] Emily:amdgpu_job_free_cb,Process information: process pid 0 thread pid 0, s_job:000000005086879e
>>>>>>>>>>>>>>>>>>>>>> [ 449.794221] Emily:amdgpu_job_free_cb,Process information: process pid 0 thread pid 0, s_job:0000000066eb74ab
>>>>>>>>>>>>>>>>>>>>>> [ 449.794222] Emily:amdgpu_job_free_cb,Process information: process pid 0 thread pid 0, s_job:00000000d4438ad9
>>>>>>>>>>>>>>>>>>>>>> [ 449.794255] Emily:amdgpu_job_free_cb,Process information: process pid 0 thread pid 0, s_job:00000000b6d69c65
>>>>>>>>>>>>>>>>>>>>>> [ 449.794257] Emily:amdgpu_job_free_cb,Process information: process pid 0 thread pid 0, s_job:00000000ea85e922
>>>>>>>>>>>>>>>>>>>>>> [ 449.794287] Emily:amdgpu_job_free_cb,Process information: process pid 0 thread pid 0, s_job:00000000ed3a5ac6
>>>>>>>>>>>>>>>>>>>>>> [ 449.794366] BUG: unable to handle kernel NULL pointer dereference at 00000000000000c0
>>>>>>>>>>>>>>>>>>>>>> [ 449.800818] PGD 0 P4D 0
>>>>>>>>>>>>>>>>>>>>>> [ 449.801040] Oops: 0000 [#1] SMP PTI
>>>>>>>>>>>>>>>>>>>>>> [ 449.801338] CPU: 3 PID: 55 Comm: kworker/3:1 Tainted: G OE 4.18.0-15-generic #16~18.04.1-Ubuntu
>>>>>>>>>>>>>>>>>>>>>> [ 449.802157] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
>>>>>>>>>>>>>>>>>>>>>> [ 449.802944] Workqueue: events drm_sched_job_timedout [amd_sched]
>>>>>>>>>>>>>>>>>>>>>> [ 449.803488] RIP: 0010:amdgpu_device_gpu_recover+0x1da/0xb60 [amdgpu]
>>>>>>>>>>>>>>>>>>>>>> [ 449.804020] Code: dd ff ff 49 39 c5 48 89 55 a8 0f 85 56 ff ff ff 45 85 e4 0f 85 a1 00 00 00 48 8b 45 b0 48 85 c0 0f 84 60 01 00 00 48 8b 40 10 <48> 8b 98 c0 00 00 00 48 85 db 0f 84 4c 01 00 00 48 8b 43 48 a8 01
>>>>>>>>>>>>>>>>>>>>>> [ 449.805593] RSP: 0018:ffffb4c7c08f7d68 EFLAGS: 00010286
>>>>>>>>>>>>>>>>>>>>>> [ 449.806032] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
>>>>>>>>>>>>>>>>>>>>>> [ 449.806625] RDX: ffffb4c7c08f5ac0 RSI: 0000000fffffffe0 RDI: 0000000000000246
>>>>>>>>>>>>>>>>>>>>>> [ 449.807224] RBP: ffffb4c7c08f7de0 R08: 00000068b9d54000 R09: 0000000000000000
>>>>>>>>>>>>>>>>>>>>>> [ 449.807818] R10: 0000000000000000 R11: 0000000000000148 R12: 0000000000000000
>>>>>>>>>>>>>>>>>>>>>> [ 449.808411] R13: ffffb4c7c08f7da0 R14: ffff8d82b8525d40 R15: ffff8d82b8525d40
>>>>>>>>>>>>>>>>>>>>>> [ 449.809004] FS:  0000000000000000(0000) GS:ffff8d82bfd80000(0000) knlGS:0000000000000000
>>>>>>>>>>>>>>>>>>>>>> [ 449.809674] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>>>>>>>>>>>>>>>>>>>> [ 449.810153] CR2: 00000000000000c0 CR3: 000000003cc0a001 CR4: 00000000003606e0
>>>>>>>>>>>>>>>>>>>>>> [ 449.810747] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>>>>>>>>>>>>>>>>>>>> [ 449.811344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>>>>>>>>>>>>>>>>>>>> [ 449.811937] Call Trace:
>>>>>>>>>>>>>>>>>>>>>> [ 449.812206]  amdgpu_job_timedout+0x114/0x140 [amdgpu]
>>>>>>>>>>>>>>>>>>>>>> [ 449.812635]  drm_sched_job_timedout+0x44/0x90 [amd_sched]
>>>>>>>>>>>>>>>>>>>>>> [ 449.813139]  ? amdgpu_cgs_destroy_device+0x10/0x10 [amdgpu]
>>>>>>>>>>>>>>>>>>>>>> [ 449.813609]  ? drm_sched_job_timedout+0x44/0x90 [amd_sched]
>>>>>>>>>>>>>>>>>>>>>> [ 449.814077]  process_one_work+0x1fd/0x3f0
>>>>>>>>>>>>>>>>>>>>>> [ 449.814417]  worker_thread+0x34/0x410
>>>>>>>>>>>>>>>>>>>>>> [ 449.814728]  kthread+0x121/0x140
>>>>>>>>>>>>>>>>>>>>>> [ 449.815004]  ? process_one_work+0x3f0/0x3f0
>>>>>>>>>>>>>>>>>>>>>> [ 449.815374]  ? kthread_create_worker_on_cpu+0x70/0x70
>>>>>>>>>>>>>>>>>>>>>> [ 449.815799]  ret_from_fork+0x35/0x40
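To make the race in the log concrete, a reduced sketch of the two paths involved; the *_sketch names are illustrative and the bodies are trimmed to the relevant steps:

    /* Path A: scheduler side, once the hardware fence has signaled.
     * drm_sched_job_cleanup() clears s_fence, then the job itself is freed. */
    static void amdgpu_job_free_cb_sketch(struct drm_sched_job *s_job)
    {
            drm_sched_job_cleanup(s_job);           /* s_job->s_fence = NULL */
            /* ... drop fence and sync references ... */
            kfree(to_amdgpu_job(s_job));            /* the job is freed right after */
    }

    /* Path B: TDR worker, racing with path A on the same job. */
    static bool job_already_signaled_sketch(struct amdgpu_job *job)
    {
            /* If path A has already run drm_sched_job_cleanup(), s_fence is
             * NULL and the unchecked job->base.s_fence->parent dereference
             * faults, as in the oops above. */
            return job && job->base.s_fence && job->base.s_fence->parent &&
                   dma_fence_is_signaled(job->base.s_fence->parent);
    }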
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:43 PM
>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 10:39 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>>> Sorry, please take your time.
>>>>>>>>>>>>>>>>>>>>>>> Have you seen my other response a bit below?
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> I can't follow how it would be possible for job->s_fence to be
>>>>>>>>>>>>>>>>>>>>>>>>>>> NULL without the job also being freed.
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> So it looks like this patch is just papering over some bigger issues.
>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>> From: Koenig, Christian 
>>>>>>>>>>>>>>>>>>>>>>>>> <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 5:08 PM
>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-
>>>>>>>>>>>>>>>>>>>>> gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null 
>>>>>>>>>>>>>>>>>>>>>>>>> pointer
>>>>>>>>>>>>>>>> issue for
>>>>>>>>>>>>>>>>>>>>>>>>> tdr
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Am 08.11.19 um 09:52 schrieb Deng, Emily:
>>>>>>>>>>>>>>>>>>>>>>>>>> Ping.....
>>>>>>>>>>>>>>>>>>>>>>>>>>> You need to give me at least enough time to wake up :)
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>>>> From: amd-gfx
>>>>>>>>>>>>>>>> <amd-gfx-bounces@lists.freedesktop.org> On
>>>>>>>>>>>>>>>>>>> Behalf
>>>>>>>>>>>>>>>>>>>>>>>>>>> Of Deng, Emily
>>>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Friday, November 8, 2019 10:56 AM
>>>>>>>>>>>>>>>>>>>>>>>>>>> To: Koenig, Christian 
>>>>>>>>>>>>>>>>>>>>>>>>>>> <Christian.Koenig@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>>>> amd- gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>>> Subject: RE: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>>>>>>>>>>>> From: Christian König
>>>>>>>>>>>>>>>> <ckoenig.leichtzumerken@gmail.com>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Sent: Thursday, November 7, 2019 7:28 PM
>>>>>>>>>>>>>>>>>>>>>>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>>>>>>>>>>>>>>>>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the null
>>>>>>>>>>>>>>>> pointer issue
>>>>>>>>>>>>>>>>>>>>>>>>>>>> for tdr
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Am 07.11.19 um 11:25 schrieb Emily Deng:
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> When the job is already signaled, the s_fence is freed. Then it
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> will have a null pointer dereference in amdgpu_device_gpu_recover.
>>>>>>>>>>>>>>>>>>>>>>>>>>>> NAK, the s_fence is only set to NULL when the job is destroyed.
>>>>>>>>>>>>>>>>>>>>>>>>>>>> See drm_sched_job_cleanup().
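For reference, drm_sched_job_cleanup() at that point was essentially the following (quoted from memory, so treat it as an approximation rather than the exact upstream code):

    void drm_sched_job_cleanup(struct drm_sched_job *job)
    {
            dma_fence_put(&job->s_fence->finished);
            job->s_fence = NULL;
    }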
>>>>>>>>>>>>>>>>>>>>>>>>>>> I know it is set to NULL in drm_sched_job_cleanup. But in one case,
>>>>>>>>>>>>>>>>>>>>>>>>>>> when we enter amdgpu_device_gpu_recover, the job is already in
>>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup and will go on to be freed. But
>>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_device_gpu_recover is sometimes faster; at that time the job
>>>>>>>>>>>>>>>>>>>>>>>>>>> is not freed yet, but the s_fence is already NULL.
>>>>>>>>>>>>>>>>>>>>>>>>> No, that case can't happen. See here:
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> drm_sched_job_cleanup(s_job);
>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_ring_priority_put(ring, s_job->s_priority);
>>>>>>>>>>>>>>>>>>>>>>>>>> dma_fence_put(job->fence);
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sync);
>>>>>>>>>>>>>>>>>>>>>>>>>> amdgpu_sync_free(&job->sched_sync);
>>>>>>>>>>>>>>>>>>>>>>>>>> kfree(job);
>>>>>>>>>>>>>>>>>>>>>>>>> The job itself is freed up directly after freeing the reference to
>>>>>>>>>>>>>>>>>>>>>>>>> the s_fence.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> So you are just papering over a much bigger problem here. This
>>>>>>>>>>>>>>>>>>>>>>>>> patch is a clear NAK.
>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>
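Why a NULL check alone does not close the race can be shown with a simplified interleaving; this is an illustration built from the free sequence quoted above, not code from the driver:

    /*
     *   scheduler thread                      TDR worker
     *   ----------------                      ----------
     *   drm_sched_job_cleanup(s_job);
     *     -> s_job->s_fence = NULL;
     *                                         if (job->base.s_fence ...)  // check races
     *   kfree(job);                           // job memory now invalid
     *                                         // any later use of 'job' is a use-after-free
     *
     * Checking s_fence for NULL only narrows the window; the job's lifetime
     * problem remains, which is the objection being raised here.
     */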
>>>>>>>>>>>>>>>>>>>>>>>>>>>> When you see a job without an s_fence then that means the problem
>>>>>>>>>>>>>>>>>>>>>>>>>>>> is somewhere else.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Regards,
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  drivers/gpu/drm/scheduler/sched_main.c     | 11 ++++++-----
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  2 files changed, 7 insertions(+), 6 deletions(-)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> index e6ce949..5a8f08e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -4075,7 +4075,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	 *
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	 * job->base holds a reference to parent fence
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -	if (job && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +	if (job && job->base.s_fence && job->base.s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	    dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  		job_signaled = true;
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> index 31809ca..56cc10e 100644
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -334,8 +334,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  			spin_lock(&rq->lock);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -				if (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -				    entity->fence_context) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +				if (bad->s_fence && (bad->s_fence->scheduled.context ==
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +				    entity->fence_context)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  					if (atomic_read(&bad->karma) >
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  					    bad->sched->hang_limit)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  						if (entity->guilty)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -376,7 +376,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	 * This iteration is thread safe as sched thread is stopped.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -		if (s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +		if (s_job->s_fence && s_job->s_fence->parent &&
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  		    dma_fence_remove_callback(s_job->s_fence->parent,
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  					      &s_job->cb)) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  			atomic_dec(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -395,7 +395,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  			 *
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  			 * Job is still alive so fence refcount at least 1
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  			 */
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -			dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +			if (s_job->s_fence)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +				dma_fence_wait(&s_job->s_fence->finished, false);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  			/*
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  			 * We must keep bad job alive for later use during
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -438,7 +439,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	 * GPU recovers can't run in parallel.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	 */
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -		struct dma_fence *fence = s_job->s_fence->parent;
>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +		struct dma_fence *fence = s_job->s_fence ? s_job->s_fence->parent : NULL;
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  		atomic_inc(&sched->hw_rq_count);
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 80+ messages in thread

end of thread, other threads:[~2019-11-18 21:05 UTC | newest]

Thread overview: 80+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-07 10:25 [PATCH] drm/amdgpu: Fix the null pointer issue for tdr Emily Deng
2019-11-07 10:25 ` Emily Deng
     [not found] ` <1573122349-22080-1-git-send-email-Emily.Deng-5C7GfCeVMHo@public.gmane.org>
2019-11-07 11:28   ` Christian König
2019-11-07 11:28     ` Christian König
     [not found]     ` <9de32e5b-69a2-f43f-629f-fef3c30bf5a1-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-08  2:55       ` Deng, Emily
2019-11-08  2:55         ` Deng, Emily
     [not found]         ` <MN2PR12MB2975D4E26CED960B82305F308F7B0-rweVpJHSKToFlvJWC7EAqwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-08  8:52           ` Deng, Emily
2019-11-08  8:52             ` Deng, Emily
     [not found]             ` <MN2PR12MB2975E26D8A8352863BA01FCA8F7B0-rweVpJHSKToFlvJWC7EAqwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-08  9:07               ` Koenig, Christian
2019-11-08  9:07                 ` Koenig, Christian
     [not found]                 ` <c01acb29-72ce-a109-3ca5-166706327d61-5C7GfCeVMHo@public.gmane.org>
2019-11-08  9:39                   ` Deng, Emily
2019-11-08  9:39                     ` Deng, Emily
     [not found]                     ` <MN2PR12MB29755CFCE09CEC0D9EB999D18F7B0-rweVpJHSKToFlvJWC7EAqwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-08  9:42                       ` Koenig, Christian
2019-11-08  9:42                         ` Koenig, Christian
     [not found]                         ` <70c2c1cc-40b8-30da-7aee-f59fbc4d0d42-5C7GfCeVMHo@public.gmane.org>
2019-11-08 10:11                           ` Deng, Emily
2019-11-08 10:11                             ` Deng, Emily
     [not found]                             ` <DM6PR12MB2971859C1BF16EE7E65B35B18F7B0-lmeGfMZKVrGd4IXjMPYtUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-08 10:14                               ` Koenig, Christian
2019-11-08 10:14                                 ` Koenig, Christian
     [not found]                                 ` <d6f9c508-3c23-c797-1cbc-7502dc4c0b13-5C7GfCeVMHo@public.gmane.org>
2019-11-08 10:22                                   ` Deng, Emily
2019-11-08 10:22                                     ` Deng, Emily
     [not found]                                     ` <DM6PR12MB29714AB9AD16FA3ABD7D62C28F7B0-lmeGfMZKVrGd4IXjMPYtUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-08 10:26                                       ` Koenig, Christian
2019-11-08 10:26                                         ` Koenig, Christian
     [not found]                                         ` <dcc1124b-5e19-b018-7449-659a8b7d74ea-5C7GfCeVMHo@public.gmane.org>
2019-11-08 10:32                                           ` Deng, Emily
2019-11-08 10:32                                             ` Deng, Emily
     [not found]                                             ` <DM6PR12MB29710DFE90F22F5903499AFE8F7B0-lmeGfMZKVrGd4IXjMPYtUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-08 10:35                                               ` Koenig, Christian
2019-11-08 10:35                                                 ` Koenig, Christian
     [not found]                                                 ` <91f4a0c4-23e3-a399-5cb1-fb01da922784-5C7GfCeVMHo@public.gmane.org>
2019-11-08 10:54                                                   ` Deng, Emily
2019-11-08 10:54                                                     ` Deng, Emily
     [not found]                                                     ` <DM6PR12MB2971D540D3000B67E44970AF8F7B0-lmeGfMZKVrGd4IXjMPYtUQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-08 19:04                                                       ` Grodzovsky, Andrey
2019-11-08 19:04                                                         ` Grodzovsky, Andrey
2019-11-08 19:01                                                   ` Grodzovsky, Andrey
2019-11-08 19:01                                                     ` Grodzovsky, Andrey
     [not found]                                                     ` <30ac4863-70e0-2b95-4819-e9431a6b4680-5C7GfCeVMHo@public.gmane.org>
2019-11-11  7:19                                                       ` Deng, Emily
2019-11-11  7:19                                                         ` Deng, Emily
     [not found]                                                         ` <MN2PR12MB2975652B5191BAC055C01BEC8F740-rweVpJHSKToFlvJWC7EAqwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-11  9:05                                                           ` Deng, Emily
2019-11-11  9:05                                                             ` Deng, Emily
     [not found]                                                             ` <MN2PR12MB2975B736A666D9EEC5E5DB158F740-rweVpJHSKToFlvJWC7EAqwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-11 21:35                                                               ` Andrey Grodzovsky
2019-11-11 21:35                                                                 ` Andrey Grodzovsky
     [not found]                                                                 ` <53130d01-da16-7cc0-55df-ea2532e6b3d0-5C7GfCeVMHo@public.gmane.org>
2019-11-12  5:48                                                                   ` Deng, Emily
2019-11-12  5:48                                                                     ` Deng, Emily
2019-11-11 18:06                                                           ` Andrey Grodzovsky
2019-11-11 18:06                                                             ` Andrey Grodzovsky
2019-11-12  3:28                                                   ` Grodzovsky, Andrey
2019-11-12  3:28                                                     ` Grodzovsky, Andrey
     [not found]                                                     ` <MWHPR12MB1453817C6F05A57FD431E159EA770-Gy0DoCVfaSWZBIDmKHdw+wdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-12  6:02                                                       ` Deng, Emily
2019-11-12  6:02                                                         ` Deng, Emily
     [not found]                                                         ` <MN2PR12MB29750EDB35E27DF9CD63152C8F770-rweVpJHSKToFlvJWC7EAqwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-11 21:11                                                           ` Christian König
2019-11-11 21:11                                                             ` Christian König
     [not found]                                                             ` <2f035f22-4057-dd9e-27ef-0f5612113e29-5C7GfCeVMHo@public.gmane.org>
2019-11-12 19:21                                                               ` Andrey Grodzovsky
2019-11-12 19:21                                                                 ` Andrey Grodzovsky
     [not found]                                                                 ` <9269d447-ed32-81f7-ab43-cb16139096e2-5C7GfCeVMHo@public.gmane.org>
2019-11-13  7:36                                                                   ` Christian König
2019-11-13  7:36                                                                     ` Christian König
     [not found]                                                                     ` <33ffe2f1-32b6-a238-3752-cee67cd9e141-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-13 14:12                                                                       ` Andrey Grodzovsky
2019-11-13 14:12                                                                         ` Andrey Grodzovsky
     [not found]                                                                         ` <40bb3114-d996-10af-3140-51a4f7c212d6-5C7GfCeVMHo@public.gmane.org>
2019-11-13 14:20                                                                           ` Christian König
2019-11-13 14:20                                                                             ` Christian König
     [not found]                                                                             ` <0858ea1b-d205-006d-a6ec-24b78b33e45b-5C7GfCeVMHo@public.gmane.org>
2019-11-13 16:00                                                                               ` Andrey Grodzovsky
2019-11-13 16:00                                                                                 ` Andrey Grodzovsky
     [not found]                                                                                 ` <c784ef0a-2cd7-d4b1-0581-356d8c401102-5C7GfCeVMHo@public.gmane.org>
2019-11-14  8:12                                                                                   ` Christian König
2019-11-14  8:12                                                                                     ` Christian König
     [not found]                                                                                     ` <088fb2bc-b401-17cc-4d7c-001705ee1eb9-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-14 15:53                                                                                       ` Andrey Grodzovsky
2019-11-14 15:53                                                                                         ` Andrey Grodzovsky
2019-11-14 22:14                                                                                       ` Andrey Grodzovsky
2019-11-14 22:14                                                                                         ` Andrey Grodzovsky
     [not found]                                                                                         ` <e267429b-9c80-a9e7-7ffd-75ec439ed759-5C7GfCeVMHo@public.gmane.org>
2019-11-15  4:39                                                                                           ` Deng, Emily
2019-11-15  4:39                                                                                             ` Deng, Emily
     [not found]                                                                                             ` <MN2PR12MB29754C96F982E8C4F5ACC4C08F700-rweVpJHSKToFlvJWC7EAqwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-11-18 14:07                                                                                               ` Andrey Grodzovsky
2019-11-18 14:07                                                                                                 ` Andrey Grodzovsky
     [not found]                                                                                                 ` <c4791437-d42d-31fd-972f-cd2cdb26e951-5C7GfCeVMHo@public.gmane.org>
2019-11-18 16:16                                                                                                   ` Christian König
2019-11-18 16:16                                                                                                     ` Christian König
     [not found]                                                                                                     ` <7963ba8a-e51b-59ce-6c3e-46670e40b27f-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-18 16:23                                                                                                       ` Andrey Grodzovsky
2019-11-18 16:23                                                                                                         ` Andrey Grodzovsky
     [not found]                                                                                                         ` <ed7d4065-f83f-3273-5820-e6556e6edc46-5C7GfCeVMHo@public.gmane.org>
2019-11-18 16:44                                                                                                           ` Christian König
2019-11-18 16:44                                                                                                             ` Christian König
     [not found]                                                                                                             ` <2ac04f61-8fe9-62a9-0240-f0bb9f2b1761-5C7GfCeVMHo@public.gmane.org>
2019-11-18 17:01                                                                                                               ` Andrey Grodzovsky
2019-11-18 17:01                                                                                                                 ` Andrey Grodzovsky
     [not found]                                                                                                                 ` <34f789a2-4abd-d6a7-3aa0-fb37e5ba5a86-5C7GfCeVMHo@public.gmane.org>
2019-11-18 20:01                                                                                                                   ` Christian König
2019-11-18 20:01                                                                                                                     ` Christian König
     [not found]                                                                                                                     ` <51b4b317-fa7e-8920-de56-698ce69a8d0a-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-11-18 21:05                                                                                                                       ` Andrey Grodzovsky
2019-11-18 21:05                                                                                                                         ` Andrey Grodzovsky

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.