* [PATCH v3] drm/amdgpu: change reset lock from mutex to rw_semaphore
@ 2020-08-21  3:47 Dennis Li
  2020-08-21  6:10 ` Liu, Monk
  2020-08-21  6:25 ` Christian König
  0 siblings, 2 replies; 3+ messages in thread
From: Dennis Li @ 2020-08-21  3:47 UTC (permalink / raw)
  To: amd-gfx, Alexander.Deucher, felix.kuehling, Hawking.Zhang,
	christian.koenig, monk.liu
  Cc: Dennis Li

Clients don't need the reset lock for synchronization with each other;
they only need to be excluded while GPU recovery is in progress. Change
the lock from a mutex to an rw_semaphore, so that clients take it
shared (read) and GPU recovery takes it exclusive (write).

v2:
return the return value of down_read_killable() instead of discarding it.

v3:
if GPU recovery has already begun, the VF ignores the FLR notification.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
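
For reference, the locking scheme the diff below introduces boils down
to the following pattern. This is a minimal illustrative sketch, not
driver code: client_op() and gpu_reset() are hypothetical stand-ins for
the call sites touched by the patch; only the rwsem calls mirror the
real code.

/* Minimal sketch of the locking pattern this patch introduces.
 * The helpers are hypothetical; only the rwsem calls mirror the
 * call sites in the diff below.
 */
#include <linux/rwsem.h>

static DECLARE_RWSEM(reset_sem);	/* adev->reset_sem in the patch */

/* Clients (debugfs etc.) take the semaphore shared, so they can run
 * concurrently with each other and only block while a reset holds it
 * exclusively. down_read_killable() lets a fatal signal abort the
 * wait instead of leaving the task stuck.
 */
static int client_op(void)
{
        int r = down_read_killable(&reset_sem);

        if (r)
                return r;	/* e.g. -EINTR when killed */

        /* ... touch state that a concurrent reset would invalidate ... */

        up_read(&reset_sem);
        return 0;
}

/* GPU recovery takes the semaphore exclusive, draining all readers. */
static void gpu_reset(void)
{
        down_write(&reset_sem);
        /* ... perform the reset ... */
        up_write(&reset_sem);
}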

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8aec832b244..ec11ed2a9ca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -954,7 +954,7 @@ struct amdgpu_device {
 
 	atomic_t 			in_gpu_reset;
 	enum pp_mp1_state               mp1_state;
-	struct mutex  lock_reset;
+	struct rw_semaphore reset_sem;
 	struct amdgpu_doorbell_index doorbell_index;
 
 	struct mutex			notifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 79b397800cbc..cc5c7f81c540 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
 
 	file->private_data = adev;
 
-	mutex_lock(&adev->lock_reset);
+	ret = down_read_killable(&adev->reset_sem);
+	if (ret)
+		return ret;
+
 	if (adev->autodump.dumping.done) {
 		reinit_completion(&adev->autodump.dumping);
 		ret = 0;
 	} else {
 		ret = -EBUSY;
 	}
-	mutex_unlock(&adev->lock_reset);
+
+	up_read(&adev->reset_sem);
 
 	return ret;
 }
@@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
 	}
 
 	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	r = down_read_killable(&adev->reset_sem);
+	if (r)
+		return r;
 
 	/* hold on the scheduler */
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1269,7 +1275,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
 		kthread_unpark(ring->sched.thread);
 	}
 
-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);
 
 	pm_runtime_mark_last_busy(dev->dev);
 	pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 		return -ENOMEM;
 
 	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	r = down_read_killable(&adev->reset_sem);
+	if (r)
+		goto pro_end;
 
 	/* stop the scheduler */
 	kthread_park(ring->sched.thread);
@@ -1500,13 +1508,14 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 	/* restart the scheduler */
 	kthread_unpark(ring->sched.thread);
 
-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);
 
 	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
 
+pro_end:
 	kfree(fences);
 
-	return 0;
+	return r;
 }
 
 static int amdgpu_debugfs_sclk_set(void *data, u64 val)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 78fd2c9a7b7d..82242e2f5658 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
 	atomic_set(&adev->in_gpu_reset, 0);
-	mutex_init(&adev->lock_reset);
+	init_rwsem(&adev->reset_sem);
 	mutex_init(&adev->psp.mutex);
 	mutex_init(&adev->notifier_lock);
 
@@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
 		return false;
 
-	mutex_lock(&adev->lock_reset);
+	down_write(&adev->reset_sem);
 
 	atomic_inc(&adev->gpu_reset_counter);
 	switch (amdgpu_asic_reset_method(adev)) {
@@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
 	atomic_set(&adev->in_gpu_reset, 0);
-	mutex_unlock(&adev->lock_reset);
+	up_write(&adev->reset_sem);
 }
 
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f27d83f2de78..9c07014d9bd6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,19 +238,15 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;
 
 	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		atomic_set(&adev->in_gpu_reset, 1);
+	if (!down_read_trylock(&adev->reset_sem))
+		return;
+
+	atomic_set(&adev->in_gpu_reset, 1);
 
 	do {
 		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
@@ -261,10 +257,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);
 
 flr_done:
-	if (locked) {
-		atomic_set(&adev->in_gpu_reset, 0);
-		mutex_unlock(&adev->lock_reset);
-	}
+	atomic_set(&adev->in_gpu_reset, 0);
+	up_read(&adev->reset_sem);
 
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 3cb10ab943a6..9c23abf9b140 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -259,19 +259,15 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;
 
 	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		atomic_set(&adev->in_gpu_reset, 1);
+	if (!down_read_trylock(&adev->reset_sem))
+		return;
+
+	atomic_set(&adev->in_gpu_reset, 1);
 
 	do {
 		if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
@@ -282,10 +278,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);
 
 flr_done:
-	if (locked) {
-		atomic_set(&adev->in_gpu_reset, 0);
-		mutex_unlock(&adev->lock_reset);
-	}
+	atomic_set(&adev->in_gpu_reset, 0);
+	up_read(&adev->reset_sem);
 
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
-- 
2.17.1
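
The v3 behavior change lives in the two mailbox FLR handlers above:
rather than tracking whether a trylock succeeded and conditionally
unlocking, the worker now simply returns when GPU recovery already
holds the semaphore, ignoring the FLR notification. A hedged sketch of
that control flow, with the mailbox polling loop reduced to a comment
(flr_work_sketch() is a placeholder, not the driver's API):

#include <linux/atomic.h>
#include <linux/rwsem.h>

/* Hypothetical stand-in for xgpu_{ai,nv}_mailbox_flr_work(). */
static void flr_work_sketch(struct rw_semaphore *reset_sem,
                            atomic_t *in_gpu_reset)
{
        /* v3: if gpu_recover() already holds the write side, a reset
         * is in flight and this FLR notification can be ignored.
         */
        if (!down_read_trylock(reset_sem))
                return;

        atomic_set(in_gpu_reset, 1);

        /* ... holding the read side blocks amdgpu_gpu_recover();
         * poll the mailbox until IDH_FLR_NOTIFICATION_CMPL arrives
         * or the timeout expires (elided; see the diff above) ...
         */

        atomic_set(in_gpu_reset, 0);
        up_read(reset_sem);

        /* ... then trigger recovery for world switch failure if no TDR ... */
}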

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


* RE: [PATCH v3] drm/amdgpu: change reset lock from mutex to rw_semaphore
  2020-08-21  3:47 [PATCH v3] drm/amdgpu: change reset lock from mutex to rw_semaphore Dennis Li
@ 2020-08-21  6:10 ` Liu, Monk
  2020-08-21  6:25 ` Christian König
  1 sibling, 0 replies; 3+ messages in thread
From: Liu, Monk @ 2020-08-21  6:10 UTC (permalink / raw)
  To: Li, Dennis, amd-gfx, Deucher, Alexander, Kuehling, Felix, Zhang,
	Hawking, Koenig, Christian
  Cc: Li, Dennis

Reviewed-by: Monk Liu <monk.liu@amd.com>

_____________________________________
Monk Liu|GPU Virtualization Team |AMD



* Re: [PATCH v3] drm/amdgpu: change reset lock from mutex to rw_semaphore
  2020-08-21  3:47 [PATCH v3] drm/amdgpu: change reset lock from mutex to rw_semaphore Dennis Li
  2020-08-21  6:10 ` Liu, Monk
@ 2020-08-21  6:25 ` Christian König
  1 sibling, 0 replies; 3+ messages in thread
From: Christian König @ 2020-08-21  6:25 UTC (permalink / raw)
  To: Dennis Li, amd-gfx, Alexander.Deucher, felix.kuehling,
	Hawking.Zhang, christian.koenig, monk.liu

On 2020-08-21 05:47, Dennis Li wrote:
> Clients don't need the reset lock for synchronization with each other;
> they only need to be excluded while GPU recovery is in progress. Change
> the lock from a mutex to an rw_semaphore, so that clients take it
> shared (read) and GPU recovery takes it exclusive (write).
>
> v2:
> return the return value of down_read_killable() instead of discarding it.
>
> v3:
> if GPU recovery has already begun, the VF ignores the FLR notification.
>
> Signed-off-by: Dennis Li <Dennis.Li@amd.com>

Acked-by: Christian König <christian.koenig@amd.com>
